
Commit 90e9aec

Merge commit '988d3885e17d94d651187899d94c54e45db58fba'
2 parents: d635ba8 + 988d388

12 files changed: +391 −118 lines

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 8 additions & 4 deletions
@@ -600,11 +600,13 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
 
   createTMABarrierAndWait(forOp, asyncLoads, loadGroups, schedule);
 
+  bool hasAsyncLoads = false;
   for (auto [op, asyncLoad] : asyncLoads) {
     auto [insertIdx, extractIdx, phase, _] = loadGroups[asyncLoad.stageDiff];
     if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
       createAsyncCopy(forOp, loadOp, asyncLoad.alloc, insertIdx, extractIdx,
                       schedule);
+      hasAsyncLoads = true;
     } else if (auto loadOp = dyn_cast<tt::DescriptorLoadOp>(op)) {
       createTMAAsyncLoad(forOp, loadOp, asyncLoad.alloc, insertIdx, extractIdx,
                          asyncLoad.barrier, asyncLoad.waitOp, schedule);
@@ -628,10 +630,12 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
   // correct stages.
   scheduleDependencies(forOp, schedule);
 
-  // Insert sync point for any possibly outstanding loads after the loop. This
-  // can happen as we speculatively execute loads in the loop.
-  builder.setInsertionPointAfter(forOp);
-  builder.create<ttg::AsyncWaitOp>(loc, ValueRange({}), 0);
+  if (hasAsyncLoads) {
+    // Insert sync point for any possibly outstanding loads after the loop. This
+    // can happen as we speculatively execute loads in the loop.
+    builder.setInsertionPointAfter(forOp);
+    builder.create<ttg::AsyncWaitOp>(loc, ValueRange({}), 0);
+  }
 
   // Make sure all ops have attributes.
   for (Operation &op : forOp.getBody()->without_terminator()) {

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 146 additions & 10 deletions
@@ -1057,6 +1057,40 @@ void LayoutRematerialization::hoistConvertIntoConditionals() {
   }
 }
 
+static bool isExpensiveMathOp(Operation *op) {
+  // These operations are either multiple instructions or have throughput
+  // lower than 16 according to the arithmetic instructions table in:
+  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions
+  return isa<arith::DivFOp, math::ErfcOp, math::SinhOp, math::CoshOp,
+             math::TanhOp, math::AsinhOp, math::AcoshOp, math::AtanhOp,
+             math::CtPopOp, math::CountLeadingZerosOp,
+             math::CountTrailingZerosOp, math::ExpOp, math::Exp2Op,
+             math::ExpM1Op, math::LogOp, math::Log2Op, math::Log10Op,
+             math::Log1pOp, math::SinOp, math::CosOp, math::TanOp, math::AsinOp,
+             math::AcosOp, math::AtanOp, math::Atan2Op, math::PowFOp,
+             math::SqrtOp, math::RsqrtOp, math::ErfOp, math::CbrtOp>(op);
+}
+
+static int64_t getByteCount(Value result, int64_t minElementCount = 0,
+                            int64_t minBitWidth = 0) {
+  int64_t elementCount = 0;
+  int64_t dtypeBitWidth = 0;
+  if (auto tensorTy = dyn_cast<RankedTensorType>(result.getType())) {
+    elementCount = tensorTy.getNumElements();
+    auto elemType = tensorTy.getElementType();
+    if (elemType.isIntOrFloat()) {
+      dtypeBitWidth = elemType.getIntOrFloatBitWidth();
+    }
+  }
+  if (elementCount < minElementCount) {
+    elementCount = minElementCount;
+  }
+  if (dtypeBitWidth < minBitWidth) {
+    dtypeBitWidth = minBitWidth;
+  }
+  return (elementCount * dtypeBitWidth) >> 3;
+}
+
 void LayoutRematerialization::backwardRematerialization(
     ConvertLayoutOp convertOp) {
   // DotOperand is hoisted by hoistDotOperand
@@ -1088,12 +1122,112 @@ void LayoutRematerialization::backwardRematerialization(
     return;
   }
 
+  // 2. Determine whether rematerialisation is beneficial.
+
+  // Identify all operations in the slice
+  SetVector<Operation *> sliceOps;
+  for (Value v : slice) {
+    if (Operation *op = v.getDefiningOp()) {
+      sliceOps.insert(op);
+    }
+  }
+
+  // Compute single-use operations
+  DenseMap<Operation *, bool> isSingleUse;
+  std::function<bool(Operation *)> isOpSingleUse;
+  isOpSingleUse = [&](Operation *op) -> bool {
+    // lookup in memoization array:
+    auto it = isSingleUse.find(op);
+    if (it != isSingleUse.end()) {
+      return it->second;
+    }
+
+    bool singleUse = true;
+
+    for (Value result : op->getResults()) {
+      for (Operation *user : result.getUsers()) {
+        if (user == convertOp) {
+          continue;
+        }
+        if (sliceOps.contains(user)) {
+          if (!isOpSingleUse(user)) {
+            singleUse = false;
+            break;
+          }
+        } else {
+          singleUse = false;
+          break;
+        }
+      }
+      if (!singleUse) {
+        break;
+      }
+    }
+
+    // insert into memoization array:
+    isSingleUse[op] = singleUse;
+    return singleUse;
+  };
+
+  // Measure the number of bytes that we're manipulating with the
+  // ConvertLayoutOp. We pessimistically assume that we round-trip
+  // through shared memory and that we cannot vectorise sub-register
+  // loads/stores, so we set a minimum element count of 32 (the warp
+  // size and number of shared memory banks) and minimum bitwidth of
+  // 32 (the width per bank of the shared memory load/store unit).
+  int64_t convertLayoutBytes = getByteCount(convertOp.getSrc(), 32, 32);
+
+  // We measure costs in standardised milli-SM-cycles. This gives:
+  // smem load/store: 8 * byte count
+  // synchronisation: 1024 (assuming 4 warps per block)
+  int64_t convertLayoutCost = 16 * convertLayoutBytes + 1024;
+  int64_t rematerialisationCost = 0;
+
+  // Evaluate single-use status for every operation in slice
+  for (Operation *op : sliceOps) {
+    auto dialect = op->getDialect();
+    if (isOpSingleUse(op)) {
+      // when we rematerialise, this operation does not get duplicated
+      // so it does not contribute to our cost model:
+      continue;
+    } else if (isa<arith::ConstantOp>(op)) {
+      // special-case: arith.constant has zero cost
+      continue;
+    } else if (isa<LoadOp>(op)) {
+      // optimistically assume L1-cached:
+      for (Value result : op->getResults()) {
+        rematerialisationCost += 8 * getByteCount(result);
+      }
+    } else if (isa<arith::ArithDialect, math::MathDialect>(dialect)) {
+      // this is an arithmetic operation; we distinguish between cheap
+      // operations (such as floating point add/mul which can be fused
+      // as halves of a single-cycle FMA instruction) and expensive
+      // operations which use the special function unit and/or involve
+      // multiple instructions.
+      int64_t multiplier = isExpensiveMathOp(op) ? 8 : 1;
+      for (Value result : op->getResults()) {
+        rematerialisationCost += multiplier * getByteCount(result);
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    DBGS() << " convert layout cost: " << convertLayoutCost << "\n";
+    DBGS() << " rematerialisation cost: " << rematerialisationCost << "\n";
+  });
+
+  if (rematerialisationCost > convertLayoutCost) {
+    LDBG(" skipped rematerialization due to higher cost");
+    return;
+  }
+
   LLVM_DEBUG({
     DBGS() << " remat convert op " << convertOp << '\n';
     for (Value v : slice)
       DBGS() << " " << v << '\n';
   });
-  // 2. Rewrite the slice.
+
+  // 3. Rewrite the slice.
   rewriteSlice(slice, layout, convertOp);
 }
 
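To see what the new heuristic does with concrete numbers, here is a rough back-of-the-envelope check in plain Python (a sketch only: the 128x128 f16 tensor and the single multi-use expensive math op in the slice are hypothetical, and the constants mirror the comments in the hunk above):

def byte_count(num_elements, bit_width, min_elements=0, min_bits=0):
    # Mirrors getByteCount: clamp to the minimums, then convert bits to bytes.
    return (max(num_elements, min_elements) * max(bit_width, min_bits)) >> 3

# Cost of keeping the ConvertLayoutOp: assumed shared-memory round trip plus a sync.
convert_bytes = byte_count(128 * 128, 16, min_elements=32, min_bits=32)  # 65536
convert_cost = 16 * convert_bytes + 1024                                 # 1049600

# Cost of rematerialising the slice: one multi-use expensive math op (multiplier 8).
remat_cost = 8 * byte_count(128 * 128, 16)                               # 262144

# remat_cost <= convert_cost, so backwardRematerialization goes ahead in this case.
print(convert_cost, remat_cost)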

@@ -1179,30 +1313,32 @@ void LayoutRematerialization::hoistConvertDotOperand(
           { DBGS() << " Block arguments not supported. Got " << v << "\n"; });
       return;
     }
-    auto loadOp = dyn_cast<LoadOp>(v.getDefiningOp());
-    // We expect the leaves of the slice to be Load or arith::Constant
-    // This could be generalised if necessary
-    if (!loadOp) {
+
+    // We expect the leaves of the slice to be Load, DescriptorLoad or
+    // arith::Constant This could be generalised if necessary
+    if (!isa<LoadOp, DescriptorLoadOp>(v.getDefiningOp())) {
       auto op = v.getDefiningOp();
       if (isa<arith::ConstantOp>(op) || noDataMovement(op)) {
         innerSlice.insert(v);
         continue;
       } else {
         LLVM_DEBUG({
-          DBGS() << " Leaves must be Load or Constant. Got " << v << "\n";
+          DBGS() << " Leaves must be Load, DescriptorLoad or Constant. Got "
+                 << v << "\n";
         });
         return;
       }
     }
+    Operation *loadOp = v.getDefiningOp();
     builder.setInsertionPointAfter(loadOp);
-    auto type = dyn_cast<RankedTensorType>(loadOp.getType());
+    auto type = dyn_cast<RankedTensorType>(loadOp->getResult(0).getType());
     if (!type)
       continue;
     auto newType = RankedTensorType::get(type.getShape(), type.getElementType(),
-                                         layout[loadOp]);
+                                         layout[loadOp->getResult(0)]);
     auto newConvertOp = builder.create<ConvertLayoutOp>(
-        convertOp.getLoc(), newType, loadOp.getResult());
-    mapping.map(loadOp.getResult(), newConvertOp.getResult());
+        convertOp.getLoc(), newType, loadOp->getResult(0));
+    mapping.map(loadOp->getResult(0), newConvertOp.getResult());
   }
 
   if (innerSlice.empty()) {

lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp

Lines changed: 12 additions & 13 deletions
@@ -175,21 +175,20 @@ static TMemChunk allocFirstFit(MemoryBitMap &memoryMap,
 }
 
 static Operation *getAlloc(Value value) {
-  Operation *op = value.getDefiningOp();
-  while (auto subOp = dyn_cast<triton::gpu::MemDescSubviewOp>(op)) {
-    if (subOp.getSrc().getDefiningOp()) {
-      op = subOp.getSrc().getDefiningOp();
-    } else {
-      auto arg = cast<BlockArgument>(subOp.getSrc());
-      auto partitions =
-          cast<WarpSpecializePartitionsOp>(arg.getOwner()->getParentOp());
-      WarpSpecializeOp wsOp = partitions.getParentOp();
-      auto capture = wsOp.getExplicitCaptures()[arg.getArgNumber()];
-      op = capture.getDefiningOp();
+  while (true) {
+    if (auto allocOp = value.getDefiningOp<TMEMAllocOp>())
+      return allocOp;
+    if (auto subviewOp = value.getDefiningOp<triton::gpu::MemDescSubviewOp>()) {
+      value = subviewOp.getSrc();
+      continue;
     }
+    auto arg = dyn_cast<BlockArgument>(value);
+    if (!arg || !isa<WarpSpecializePartitionsOp>(arg.getOwner()->getParentOp()))
+      llvm::report_fatal_error("expected to find a TMEM alloc op");
+    auto partitions =
+        cast<WarpSpecializePartitionsOp>(arg.getOwner()->getParentOp());
+    value = partitions.getParentOp().getExplicitCaptures()[arg.getArgNumber()];
   }
-  assert(isa<triton::nvidia_gpu::TMEMAllocOp>(op) && "Expected a TMEMAllocOp");
-  return op;
 }
 
 class RowIdConstraints {
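The rewritten getAlloc walks the def chain iteratively instead of assuming every step has a defining op. A toy Python sketch of the same control flow (stand-in Node type and hypothetical field names, not the MLIR API):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Node:
    kind: str                      # "tmem_alloc", "memdesc_subview" or "ws_capture"
    src: Optional["Node"] = None   # subview source or captured value, if any

def find_alloc(node: Node) -> Node:
    # Follow subview sources and warp-specialize captures until the TMEM alloc is found.
    while True:
        if node.kind == "tmem_alloc":
            return node
        if node.kind in ("memdesc_subview", "ws_capture") and node.src is not None:
            node = node.src
            continue
        raise RuntimeError("expected to find a TMEM alloc op")

alloc = Node("tmem_alloc")
assert find_alloc(Node("memdesc_subview", src=Node("ws_capture", src=alloc))) is alloc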

lib/Tools/LinearLayout.cpp

Lines changed: 7 additions & 1 deletion
@@ -1088,7 +1088,13 @@ LinearLayout LinearLayout::removeZeroBasesAlongDim(StringAttr stripDim) const {
       }
     }
   }
-  return LinearLayout(std::move(result), llvm::to_vector(getOutDimNames()));
+  SmallVector<std::pair<StringAttr, int32_t>> newOutDimSizes;
+  for (auto outDim : getOutDimNames()) {
+    newOutDimSizes.push_back({outDim, getOutDimSize(outDim)});
+  }
+  auto newLayout = LinearLayout(std::move(result), ArrayRef(newOutDimSizes),
+                                this->isSurjective());
+  return newLayout;
 }
 
 size_t hash_value(const LinearLayout &layout) {
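The point of passing the original output-dimension sizes and the surjectivity flag explicitly is that rebuilding a LinearLayout from its bases alone infers the smallest output sizes spanned by the bases (and treats the result as surjective), which can shrink a non-surjective layout. A toy Python illustration (not the LinearLayout API; the numbers are made up):

def inferred_out_size(bases):
    # Smallest power of two covering everything the bases can reach (toy 1-D model).
    span = 0
    for b in bases:
        span |= b
    size = 1
    while size <= span:
        size *= 2
    return size

bases, declared_out_size = [1, 2, 0], 8   # hypothetical layout: spans 0..3 inside [0, 8)
stripped = [b for b in bases if b != 0]   # what removeZeroBasesAlongDim keeps
print(inferred_out_size(stripped))        # 4 -- what rebuilding from bases alone would declare
print(declared_out_size)                  # 8 -- what the fix now preserves explicitly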

setup.py

Lines changed: 18 additions & 11 deletions
@@ -776,6 +776,22 @@ def get_git_version_suffix():
 # keep it separate for easy substitution
 TRITON_VERSION = "3.3.0" + get_git_version_suffix() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", "")
 
+# Dynamically define supported Python versions and classifiers
+MIN_PYTHON = (3, 9)
+MAX_PYTHON = (3, 13)
+
+PYTHON_REQUIRES = f">={MIN_PYTHON[0]}.{MIN_PYTHON[1]},<{MAX_PYTHON[0]}.{MAX_PYTHON[1] + 1}"
+BASE_CLASSIFIERS = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Topic :: Software Development :: Build Tools",
+    "License :: OSI Approved :: MIT License",
+]
+PYTHON_CLASSIFIERS = [
+    f"Programming Language :: Python :: {MIN_PYTHON[0]}.{m}" for m in range(MIN_PYTHON[1], MAX_PYTHON[1] + 1)
+]
+CLASSIFIERS = BASE_CLASSIFIERS + PYTHON_CLASSIFIERS
+
 setup(
     name=os.environ.get("TRITON_WHEEL_NAME", "triton"),
     version=TRITON_VERSION,
@@ -807,17 +823,8 @@ def get_git_version_suffix():
     # for PyPI
     keywords=["Compiler", "Deep Learning"],
     url="https://github.com/triton-lang/triton/",
-    classifiers=[
-        "Development Status :: 4 - Beta",
-        "Intended Audience :: Developers",
-        "Topic :: Software Development :: Build Tools",
-        "License :: OSI Approved :: MIT License",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-        "Programming Language :: Python :: 3.12",
-        "Programming Language :: Python :: 3.13",
-    ],
+    python_requires=PYTHON_REQUIRES,
+    classifiers=CLASSIFIERS,
     test_suite="tests",
     extras_require={
         "build": [
