Skip to content

Commit ca1ce1b

Browse files
authored
[Backend] Improve how dynamic register reallocation is implemented (triton-lang#6694)
This PR generally improves how the compiler handles dynamic register reallocation for warp specialization.

* Codegen generates `setmaxnreg` both at the entry and exit of the partition regions, even when the number of registers does not change. This somehow makes `ptxas` behave much better, allowing the registers allocated to the load and MMA partitions to drop to `24` as they should be. This should improve register pressure across the board for warp specialized kernels.
* The maximum number of warpgroups is computed across the whole program and each `ttg.warp_specialize` is padded to it. This ensures all warps are always present to surrender registers. This primarily improves the layering between partitioning in the middle end (no longer need the "extra empty warp" hack).
* Handle TMEM ops when relayout'ing the IR (they require a minimum of 4 warps). Thankfully, the TMEM compatible distributed layout can always be inferred for these ops.
1 parent 7ad7cee commit ca1ce1b

File tree

13 files changed

+452
-109
lines changed

13 files changed

+452
-109
lines changed

lib/Conversion/TritonGPUToLLVM/AllocateWarpGroups.cpp

Lines changed: 76 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,89 @@ using namespace mlir;
1111
using namespace mlir::triton;
1212
using namespace mlir::triton::gpu;
1313

14+
// Given a `ttg.warp_specialize` with a certain number of existing warps, pad it
15+
// with extra warps until it has the same number of full warp groups as the
16+
// largest partitioning. This ensures that all threads can be present to
17+
// surrender registers.
18+
static void padToMaxWarpGroups(WarpSpecializeOp op, int numExtraWarpGroups) {
19+
int numExtraWarps = op.getTotalPartitionWarps();
20+
int warpsToAdd = numExtraWarpGroups * 4 - numExtraWarps;
21+
assert(warpsToAdd >= 0);
22+
23+
// Fill it with powers of 2.
24+
SmallVector<int> paddingPartitionSizes;
25+
while (warpsToAdd > 0) {
26+
int paddingSize = llvm::NextPowerOf2(warpsToAdd) / 2;
27+
paddingPartitionSizes.push_back(paddingSize);
28+
warpsToAdd -= paddingSize;
29+
}
30+
31+
auto partitions = cast<WarpSpecializePartitionsOp>(
32+
op.getPartitionOpHolder().front().front());
33+
OperationState state(partitions.getLoc(), partitions.getOperationName());
34+
for (Region *region : partitions.getRegions())
35+
state.addRegion()->takeBody(*region);
36+
37+
SmallVector<int32_t> partitionNumWarps(op.getPartitionNumWarps());
38+
for (int paddingSize : paddingPartitionSizes) {
39+
partitionNumWarps.push_back(paddingSize);
40+
41+
Block &body = state.addRegion()->emplaceBlock();
42+
for (Value capture : op.getExplicitCaptures())
43+
body.addArgument(capture.getType(), capture.getLoc());
44+
OpBuilder b(op.getContext());
45+
b.setInsertionPointToStart(&body);
46+
b.create<WarpReturnOp>(op.getLoc());
47+
}
48+
op.setPartitionNumWarps(partitionNumWarps);
49+
50+
// Set the requested registers to low for the padded partitions that do
51+
// nothing.
52+
if (auto reqRegs = op.getRequestedRegisters()) {
53+
SmallVector<int32_t> newReqRegs(*reqRegs);
54+
newReqRegs.append(paddingPartitionSizes.size(), 16);
55+
op.setRequestedRegisters(newReqRegs);
56+
}
57+
58+
OpBuilder b(partitions);
59+
b.create(state);
60+
partitions.erase();
61+
}
62+
1463
namespace {
1564
struct AllocateWarpGroups
1665
: public mlir::triton::gpu::impl::TritonGPUAllocateWarpGroupsBase<
1766
AllocateWarpGroups> {
1867
void runOnOperation() override {
1968
ModuleOp mod = getOperation();
2069

70+
// First determine the maximum number of extra warps.
71+
int maxExtraWarps = 0;
72+
mod.walk([&](WarpSpecializeOp op) {
73+
maxExtraWarps = std::max<int>(maxExtraWarps, op.getTotalPartitionWarps());
74+
});
75+
76+
// Round this up to the nearest warpgroup (multiple of 4) and then pad each
77+
// `ttg.warp_specialize` to the nearest warpgroup.
78+
int numExtraWarpGroups = llvm::divideCeil(maxExtraWarps, 4);
79+
mod.walk([&](WarpSpecializeOp op) {
80+
padToMaxWarpGroups(op, numExtraWarpGroups);
81+
});
82+
83+
// Determine the maximum number of registers per thread. This may have
84+
// been set by the user.
2185
int threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod);
86+
int baseNumWarps = lookupNumWarps(mod);
87+
int maxnreg;
88+
if (auto maxnregAttr =
89+
mod->getAttrOfType<IntegerAttr>(AttrMaxRegistersName)) {
90+
maxnreg = maxnregAttr.getInt();
91+
} else {
92+
// Assume the user wants to use all 64K registers.
93+
maxnreg = (64 * 1024) / (baseNumWarps + numExtraWarpGroups * 4) /
94+
threadsPerWarp;
95+
maxnreg = maxnreg / 8 * 8;
96+
}
2297

2398
struct WarpGroupInfo {
2499
SmallVector<Region *> partitions;
@@ -33,12 +108,8 @@ struct AllocateWarpGroups
33108
};
34109

35110
// Compute the total number of warps required at any given time.
36-
int baseNumWarps = lookupNumWarps(mod);
37-
int maxExtraWarps = 0;
38111
mod.walk([&](WarpSpecializeOp op) {
39112
ArrayRef<int32_t> arr = op.getPartitionNumWarps();
40-
int req = op.getTotalPartitionWarps();
41-
maxExtraWarps = std::max(maxExtraWarps, req);
42113

43114
// Allocate the start IDs such that the largest warpgroups have lower
44115
// starting warp IDs.
@@ -85,18 +156,6 @@ struct AllocateWarpGroups
85156
warpGroups.back().numWarps += numWarps;
86157
}
87158

88-
// Determine the maximum number of registers per thread. This may have
89-
// been set by the user.
90-
int maxnreg;
91-
if (auto maxnregAttr =
92-
op->getAttrOfType<IntegerAttr>(AttrMaxRegistersName)) {
93-
maxnreg = maxnregAttr.getInt();
94-
} else {
95-
maxnreg = (1 << 16) / (baseNumWarps + op.getTotalPartitionWarps()) /
96-
threadsPerWarp;
97-
maxnreg = maxnreg / 8 * 8;
98-
}
99-
100159
// Compute the register deficit over the partition warp groups.
101160
int registerDeficit = 0;
102161
for (const WarpGroupInfo &wg : warpGroups) {
@@ -135,7 +194,7 @@ struct AllocateWarpGroups
135194

136195
Builder b(&getContext());
137196
mod->setAttr("ttg.total-num-warps",
138-
b.getI32IntegerAttr(baseNumWarps + maxExtraWarps));
197+
b.getI32IntegerAttr(baseNumWarps + numExtraWarpGroups * 4));
139198
}
140199
};
141200
} // namespace

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 105 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ namespace {
2525
using namespace mlir;
2626
using namespace mlir::triton;
2727
using namespace mlir::triton::gpu;
28+
namespace ttng = triton::nvidia_gpu;
2829

2930
// pass named attrs (e.g., tt.contiguity) from Triton to Triton
3031
static void addNamedAttrs(Operation *op, DictionaryAttr dictAttrs) {
@@ -466,6 +467,72 @@ struct GatherScatterOpPattern : public OpConversionPattern<OpT> {
466467
}
467468
};
468469

470+
// Given a tensor and its representation in tensor memory, determine its
471+
// distributed layout.
472+
static RankedTensorType getTMEMTensorLayout(const TypeConverter *tc,
473+
RankedTensorType type,
474+
MemDescType memdesc,
475+
unsigned numWarps) {
476+
Attribute encoding;
477+
type = cast<RankedTensorType>(tc->convertType(type));
478+
if (isa<ttng::TensorMemoryScalesEncodingAttr>(memdesc.getEncoding())) {
479+
encoding = LinearEncodingAttr::get(
480+
type.getContext(), getScaleTMEMStoreLinearLayout(type, numWarps));
481+
} else {
482+
auto tmemEnc = cast<ttng::TensorMemoryEncodingAttr>(memdesc.getEncoding());
483+
encoding = ttng::getTmemCompatibleLayout(
484+
tmemEnc.getBlockM(), tmemEnc.getBlockN(), type, numWarps);
485+
}
486+
return RankedTensorType::get(type.getShape(), type.getElementType(),
487+
encoding);
488+
}
489+
490+
struct TMEMLoadOpPattern : public OpConversionPattern<ttng::TMEMLoadOp> {
491+
using OpConversionPattern::OpConversionPattern;
492+
493+
LogicalResult
494+
matchAndRewrite(ttng::TMEMLoadOp op, OpAdaptor adaptor,
495+
ConversionPatternRewriter &rewriter) const override {
496+
RankedTensorType type = getTMEMTensorLayout(
497+
typeConverter, op.getType(), op.getSrc().getType(), lookupNumWarps(op));
498+
rewriter.modifyOpInPlace(op, [&] { op.getResult().setType(type); });
499+
return success();
500+
}
501+
};
502+
503+
struct TMEMStoreOpPattern : public OpConversionPattern<ttng::TMEMStoreOp> {
504+
using OpConversionPattern::OpConversionPattern;
505+
506+
LogicalResult
507+
matchAndRewrite(ttng::TMEMStoreOp op, OpAdaptor adaptor,
508+
ConversionPatternRewriter &rewriter) const override {
509+
RankedTensorType type =
510+
getTMEMTensorLayout(typeConverter, op.getSrc().getType(),
511+
op.getDst().getType(), lookupNumWarps(op));
512+
Value src =
513+
rewriter.create<ConvertLayoutOp>(op.getLoc(), type, adaptor.getSrc());
514+
rewriter.modifyOpInPlace(op, [&] { op.getSrcMutable().assign(src); });
515+
return success();
516+
}
517+
};
518+
519+
struct TMEMAllocOpPattern : public OpConversionPattern<ttng::TMEMAllocOp> {
520+
using OpConversionPattern::OpConversionPattern;
521+
522+
LogicalResult
523+
matchAndRewrite(ttng::TMEMAllocOp op, OpAdaptor adaptor,
524+
ConversionPatternRewriter &rewriter) const override {
525+
if (!op.getSrc())
526+
return success();
527+
RankedTensorType type = getTMEMTensorLayout(
528+
typeConverter, op.getSrc().getType(), op.getType(), lookupNumWarps(op));
529+
Value src =
530+
rewriter.create<ConvertLayoutOp>(op.getLoc(), type, adaptor.getSrc());
531+
rewriter.modifyOpInPlace(op, [&] { op.getSrcMutable().assign(src); });
532+
return success();
533+
}
534+
};
535+
469536
struct TritonTransPattern : public OpConversionPattern<TransOp> {
470537
using OpConversionPattern::OpConversionPattern;
471538

@@ -592,40 +659,61 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
592659
MLIRContext *context = patterns.getContext();
593660
patterns.insert< // TODO: view should have custom pattern that views the
594661
// layout
662+
// clang-format off
595663
GenericOpPattern<triton::AdvanceOp>,
596664
GenericOpPattern<triton::MakeTensorPtrOp>,
597-
GenericOpPattern<triton::ReshapeOp>, GenericOpPattern<triton::BitcastOp>,
598-
GenericOpPattern<triton::FpToFpOp>, GenericOpPattern<triton::IntToPtrOp>,
599-
GenericOpPattern<triton::PtrToIntOp>, GenericOpPattern<triton::SplatOp>,
600-
TritonBroadcastPattern, GenericOpPattern<triton::AddPtrOp>,
601-
TritonCatPattern, TritonJoinOpPattern, TritonSplitOpPattern,
665+
GenericOpPattern<triton::ReshapeOp>,
666+
GenericOpPattern<triton::BitcastOp>,
667+
GenericOpPattern<triton::FpToFpOp>,
668+
GenericOpPattern<triton::IntToPtrOp>,
669+
GenericOpPattern<triton::PtrToIntOp>,
670+
GenericOpPattern<triton::SplatOp>,
671+
GenericOpPattern<triton::AddPtrOp>,
672+
TritonBroadcastPattern,
673+
TritonCatPattern,
674+
TritonJoinOpPattern,
675+
TritonSplitOpPattern,
602676
GenericOpPattern<triton::ClampFOp>,
603677
GenericOpPattern<triton::PreciseSqrtOp>,
604678
GenericOpPattern<triton::PreciseDivFOp>,
605679
GenericOpPattern<triton::MulhiUIOp>,
606-
GenericOpPattern<triton::ElementwiseInlineAsmOp>, TritonReducePattern,
607-
GenericOpPattern<triton::ReduceReturnOp>, TritonScanPattern,
680+
GenericOpPattern<triton::ElementwiseInlineAsmOp>,
681+
TritonReducePattern,
682+
GenericOpPattern<triton::ReduceReturnOp>,
683+
TritonScanPattern,
608684
GenericOpPattern<triton::ScanReturnOp>,
609-
GenericOpPattern<triton::MakeRangeOp>, TritonExpandDimsPattern,
610-
TritonTransPattern, TritonDotPattern,
685+
GenericOpPattern<triton::MakeRangeOp>,
686+
TritonExpandDimsPattern,
687+
TritonTransPattern,
688+
TritonDotPattern,
611689
GatherScatterOpPattern<DescriptorGatherOp>,
612690
GatherScatterOpPattern<DescriptorScatterOp>,
613-
GatherScatterOpPattern<triton::nvidia_gpu::AsyncTMAGatherOp>,
614-
GatherScatterOpPattern<triton::nvidia_gpu::AsyncTMAScatterOp>,
615-
GenericOpPattern<triton::LoadOp>, GenericOpPattern<triton::StoreOp>,
616-
GenericOpPattern<triton::HistogramOp>, GenericOpPattern<triton::GatherOp>,
691+
GatherScatterOpPattern<ttng::AsyncTMAGatherOp>,
692+
GatherScatterOpPattern<ttng::AsyncTMAScatterOp>,
693+
TMEMLoadOpPattern,
694+
TMEMStoreOpPattern,
695+
TMEMAllocOpPattern,
696+
GenericOpPattern<triton::LoadOp>,
697+
GenericOpPattern<triton::StoreOp>,
698+
GenericOpPattern<triton::HistogramOp>,
699+
GenericOpPattern<triton::GatherOp>,
617700
GenericOpPattern<triton::ExternElementwiseOp>,
618-
GenericOpPattern<triton::PrintOp>, GenericOpPattern<triton::AssertOp>,
701+
GenericOpPattern<triton::PrintOp>,
702+
GenericOpPattern<triton::AssertOp>,
619703
GenericOpPattern<triton::AtomicCASOp>,
620-
GenericOpPattern<triton::AtomicRMWOp>, GenericOpPattern<ReturnOp>,
704+
GenericOpPattern<triton::AtomicRMWOp>,
621705
GenericOpPattern<triton::DescriptorLoadOp>,
622706
GenericOpPattern<triton::DescriptorStoreOp>,
623707
GenericOpPattern<triton::DescriptorReduceOp>,
624708
GenericOpPattern<triton::ExperimentalTensormapCreateOp>,
625709
GenericOpPattern<triton::ExperimentalTensormapFenceproxyAcquireOp>,
626710
// this assumes the right layout will be set later for dot scaled.
627-
GenericOpPattern<triton::DotScaledOp>, GenericOpPattern<triton::CallOp>,
628-
TritonFuncOpPattern>(typeConverter, context);
711+
GenericOpPattern<triton::DotScaledOp>,
712+
GenericOpPattern<triton::CallOp>,
713+
GenericOpPattern<ReturnOp>,
714+
TritonFuncOpPattern
715+
// clang-format on
716+
>(typeConverter, context);
629717
}
630718
// Proton patterns
631719
// NOTE: Because Proton's inputs are scalars and not tensors this conversion

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,8 @@ static WarpSchedule getInitialSchedule(const PartitionScheme &scheme) {
213213
userPartition->insert(userOp);
214214
// Place the epilogue partition in the default warpgroup. The MMA and load
215215
// partitions shouldn't have tensor computations in them, which means they
216-
// will get assigned just 1 warp each. Add an extra partition to pad the
217-
// number of warps to the nearest warpgroup.
218-
schedule.addPartition(0);
219-
schedule.reorderPartitions({2, 1, 0, 3});
216+
// will get assigned just 1 warp each.
217+
schedule.reorderPartitions({2, 1, 0});
220218
}
221219

222220
schedule.updatePartitions();

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/OptimizePartitionWarps.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ static LogicalResult relayoutWarps(ModuleAxisInfoAnalysis &axisInfo,
127127
pm.addPass(createTritonGPUCoalesce());
128128
pm.addPass(createTritonGPURemoveLayoutConversions());
129129
pm.addPass(createTritonGPUOptimizeThreadLocality());
130+
pm.addPass(createTritonGPUAccelerateMatmul());
130131
pm.addPass(createTritonGPURemoveLayoutConversions());
131132
if (failed(runPipeline(pm, *container)))
132133
return failure();
@@ -192,17 +193,19 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
192193
SmallVector<int32_t> partitionNumWarps =
193194
llvm::to_vector(wsOp.getPartitionNumWarps());
194195

195-
// Some instructions have critical throughput if have low register usage. Make
196-
// sure there are enough warps for these ops to execute quickly.
196+
// Determine if a partition has a lower limit on the number of warps.
197197
SmallVector<int32_t> minWarpsForPartition(partitionNumWarps.size(), 1);
198198
for (auto [minWarps, region] :
199199
llvm::zip(minWarpsForPartition, wsOp.getPartitionRegions())) {
200200
region->walk([minWarps = &minWarps](Operation *op) {
201-
if (!isa<scf::ForOp>(op->getParentOp()))
202-
return;
201+
// Some instructions have critical throughput if they have low register usage.
202+
// Make sure there are enough warps for these ops to execute quickly.
203203
if (isa<ttng::AsyncTMAGatherOp, ttng::AsyncTMAScatterOp,
204204
ttng::AsyncTMACopyGlobalToLocalOp>(op))
205205
*minWarps = 2;
206+
// TMEM ops require at least 4 warps to be able to read all lanes.
207+
else if (isa<ttng::TMEMLoadOp, ttng::TMEMStoreOp, ttng::TMEMAllocOp>(op))
208+
*minWarps = 4;
206209
});
207210
}
208211

@@ -254,7 +257,7 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
254257
llvm::zip(wsOp.getPartitionRegions(), partitionNumWarps,
255258
wsOp.getPartitionNumWarps(), maxTensorRegs, estRegUsage)) {
256259
// "Guess" the register usage for each partition.
257-
estRegs = tensorRegs ? 80 : 48;
260+
estRegs = tensorRegs ? 72 : 24;
258261

259262
// Layouts need to be reassigned if the number of warps changed and there
260263
// are tensor computations.

test/Conversion/allocate_warp_groups.mlir

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ module attributes {"ttg.num-warps" = 4 : i32} {
66

77
// -----
88

9-
// CHECK: module attributes {"ttg.num-warps" = 4 : i32, "ttg.total-num-warps" = 17 : i32}
9+
// CHECK: module attributes {"ttg.num-warps" = 4 : i32, "ttg.total-num-warps" = 20 : i32}
1010
module attributes {"ttg.num-warps" = 4 : i32} {
1111

1212
tt.func @kernel() {
13-
// CHECK: ttg.warp_specialize() attributes {warpGroupStartIds = array<i32: 16, 4, 12>}
13+
// CHECK: ttg.warp_specialize() attributes {warpGroupStartIds = array<i32: 18, 4, 12, 16, 19>}
1414
ttg.warp_specialize()
1515
default {
1616
ttg.warp_yield
@@ -24,18 +24,20 @@ tt.func @kernel() {
2424
partition2() num_warps(4) {
2525
ttg.warp_return
2626
} : () -> ()
27+
// CHECK: partition3() num_warps(2)
28+
// CHECK: partition4() num_warps(1)
2729
tt.return
2830
}
2931

3032
}
3133

3234
// -----
3335

34-
// CHECK: module attributes {"ttg.num-warps" = 2 : i32, "ttg.total-num-warps" = 11 : i32}
35-
module attributes {"ttg.num-warps" = 2 : i32} {
36+
// CHECK: module attributes {"ttg.num-warps" = 4 : i32, "ttg.total-num-warps" = 16 : i32}
37+
module attributes {"ttg.num-warps" = 4 : i32} {
3638

3739
tt.func @two_warp_specialize() {
38-
// CHECK: ttg.warp_specialize() attributes {warpGroupStartIds = array<i32: 2, 4>}
40+
// CHECK: ttg.warp_specialize() attributes {warpGroupStartIds = array<i32: 12, 14, 4, 15>}
3941
ttg.warp_specialize()
4042
default {
4143
ttg.warp_yield
@@ -46,8 +48,10 @@ tt.func @two_warp_specialize() {
4648
partition1() num_warps(1) {
4749
ttg.warp_return
4850
} : () -> ()
51+
// CHECK: partition2() num_warps(8)
52+
// CHECK: partition3() num_warps(1)
4953

50-
// CHECK: ttg.warp_specialize() attributes {warpGroupStartIds = array<i32: 10, 2>}
54+
// CHECK: ttg.warp_specialize() attributes {warpGroupStartIds = array<i32: 14, 4, 12, 15>}
5155
ttg.warp_specialize()
5256
default {
5357
ttg.warp_yield

0 commit comments

Comments
 (0)