Commit 8adf079

Merge OpenAI Triton commit 2a04155 (#3437)
This PR changes the Triton base from 27c8363 to 2a04155 (Feb 13). Pass rate: 98.11%. Please do not squash and merge this PR.
2 parents 766cab6 + caac355 commit 8adf079

65 files changed (+681, -398 lines)


RELEASE.md

Lines changed: 50 additions & 1 deletion

@@ -1,4 +1,53 @@
-# Release Process
+# Releasing Triton
+
+Triton releases provide a stable snapshot of the code base, encapsulated in a binary that can easily be consumed through PyPI. Additionally, releases are the points in time at which we, as the development team, can signal to the community which new features are available, what improvements have been made, and what upcoming changes may impact them (i.e. breaking changes).
+
+## Release Compatibility Matrix
+
+Following is the release compatibility matrix for Triton releases:
+
+| Triton version | Python version | Manylinux version |
+| --- | --- | --- |
+| 3.2.0 | >=3.9, <=3.13 | glibc 2.17+ x86-64 |
+| 3.1.0 | >=3.8, <=3.12 | glibc 2.17+ x86-64 |
+| 3.0.0 | >=3.8, <=3.12 | glibc 2.17+ x86-64 |
+| 2.3.1 | >=3.7, <=3.12 | glibc 2.17+ x86-64 |
+| 2.3.0 | >=3.7, <=3.12 | glibc 2.17+ x86-64 |
+| 2.2.0 | >=3.7, <=3.12 | glibc 2.17+ x86-64 |
+| 2.1.0 | >=3.7, <=3.11 | glibc 2.17+ x86-64 |
+| 2.0.0 | >=3.6, <=3.11 | glibc 2.17+ x86-64 |
+| 1.1.1 | >=3.6, <=3.9 | glibc 2.17+ x86-64 |
+| 1.1.0 | >=3.6, <=3.9 | glibc 2.17+ x86-64 |
+| 1.0.0 | >=3.6, <=3.9 | glibc 2.17+ x86-64 |
+
+## Release Cadence
+
+Following is the release cadence for 2024/2025. All future release dates below are tentative. Please note: patch releases are optional.
+
+| Minor version | Release branch cut | Release date | Patch release date |
+| --- | --- | --- | --- |
+| 3.5.0 | Sep 2025 | Oct 2025 | --- |
+| 3.4.0 | Jun 2025 | Jul 2025 | --- |
+| 3.3.0 | Feb/Mar 2025 | Apr 2025 | --- |
+| 3.2.0 | Dec 2024 | Jan 2025 | --- |
+| 3.1.0 | Jun 2024 | Oct 2024 | --- |
+| 3.0.0 | Jun 2024 | Jul 2024 | --- |
+| 2.3.0 | Dec 2023 | Apr 2024 | May 2024 |
+| 2.2.0 | Dec 2023 | Jan 2024 | --- |
+
+## Release Cherry-Pick Criteria
+
+After the branch cut, we finalize the release branch with clear criteria for which cherry-picks are allowed in. (A cherry-pick is the process of landing a PR on the release branch after the branch cut.) Cherry-picks are deliberately limited so that the team has sufficient time to complete a thorough round of testing on a stable code base.
+
+* Regression fixes that address a functional or performance regression against the most recent release (e.g. 3.2 for the 3.3 release)
+* Critical fixes for severe issues such as silent incorrectness, backwards incompatibility, crashes, deadlocks, or (large) memory leaks
+* Fixes to new features introduced in the most recent release (e.g. 3.2 for the 3.3 release)
+* Documentation improvements
+* Release-branch-specific changes (e.g. version identifier changes or CI fixes)
+
+Please note: **No feature work is allowed in cherry-picks**. All PRs considered for cherry-picking must be merged on trunk; the only exception is release-branch-specific changes. An issue for tracking cherry-picks to the release branch is created after the branch cut. **Only issues that have ‘cherry-picks’ in the issue tracker will be considered for the release.**
+
 # Intel Release Process
 
 Intel XPU Backend for Triton releases are aligned to the upstream `triton-lang/triton` project and to `PyTorch`. To make a release:

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@ void populateSPMDOpToLLVMPattern(LLVMTypeConverter &typeConverter,
                                 PatternBenefit benefit);
 
 void populateFuncOpConversionPattern(LLVMTypeConverter &typeConverter,
-                                     RewritePatternSet &patterns, int numWarps,
+                                     RewritePatternSet &patterns,
                                      const TargetInfoBase &targetInfo,
                                      PatternBenefit benefit);

include/triton/Conversion/TritonGPUToLLVM/TypeConverter.h

Lines changed: 5 additions & 1 deletion

@@ -14,7 +14,11 @@ class TritonGPUToLLVMTypeConverter : public LLVMTypeConverter {
 public:
   using TypeConverter::convertType;
 
-  TritonGPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option,
+  TritonGPUToLLVMTypeConverter(MLIRContext *ctx,
+                               const LowerToLLVMOptions &option,
+                               const TargetInfoBase &targetInfo,
+                               const DataLayoutAnalysis *analysis = nullptr);
+  TritonGPUToLLVMTypeConverter(MLIRContext *ctx,
                                const TargetInfoBase &targetInfo,
                                const DataLayoutAnalysis *analysis = nullptr);
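The converter thus gains an overload that takes the lowering options as a const reference alongside the target info. A hedged usage sketch of the new overload; the variable names are illustrative, not from this commit:

// Hypothetical call site for the new constructor overload.
mlir::LowerToLLVMOptions options(context);
TritonGPUToLLVMTypeConverter typeConverter(context, options, targetInfo);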

include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h

Lines changed: 0 additions & 6 deletions

@@ -12,12 +12,6 @@ template <typename T> class OperationPass;
 
 namespace triton {
 
-constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
-constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
-constexpr static char AttrTargetName[] = "ttg.target";
-
-constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";
-
 // Create the pass with numWarps passed from cl::opt.
 std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonToTritonGPUPass();

include/triton/Dialect/Triton/IR/Dialect.h

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ class DialectVerifyTensorLayoutInterface
   DialectVerifyTensorLayoutInterface(Dialect *dialect) : Base(dialect) {}
 
   virtual LogicalResult
-  verifyTensorLayout(Attribute layout, RankedTensorType type, ModuleOp module,
+  verifyTensorLayout(Attribute layout, RankedTensorType type, Operation *op,
                      function_ref<InFlightDiagnostic()> emitError) const = 0;
 };
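Backends implementing this interface must adapt their overrides from `ModuleOp module` to `Operation *op`. A hedged sketch of the adaptation (the class name is hypothetical); the enclosing module stays reachable when module-level attributes are still needed:

// Hypothetical backend override adapted to the new signature: callers now
// pass the op whose tensor layout is being verified, not the ModuleOp.
LogicalResult MyVerifyTensorLayoutInterface::verifyTensorLayout(
    Attribute layout, RankedTensorType type, Operation *op,
    function_ref<InFlightDiagnostic()> emitError) const {
  // Recover the module from the op if an implementation needs it.
  auto module = op->getParentOfType<ModuleOp>();
  if (!module)
    return emitError() << "expected op to be nested in a module";
  // ... layout-specific checks go here ...
  return success();
}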

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 5 additions & 1 deletion

@@ -1118,7 +1118,11 @@ def CallOp : TT_Op<"call", [CallOpInterface, /*MemRefsNormalizable, */DeclareOpI
   }];
 }
 
-def FuncOp : TT_Op<"func", [AffineScope, AutomaticAllocationScope, CallableOpInterface, FunctionOpInterface, IsolatedFromAbove, OpAsmOpInterface]> {
+def FuncOp : TT_Op<"func", [
+    AffineScope, AutomaticAllocationScope, CallableOpInterface,
+    FunctionOpInterface, IsolatedFromAbove, OpAsmOpInterface,
+    HasParent<"ModuleOp">
+]> {
   let summary = "An operation with a name containing a single `SSACFG` region";
   let description = [{
     Operations within the function cannot implicitly capture values defined

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 7 additions & 0 deletions

@@ -39,6 +39,13 @@ template <> struct hash<CacheKey> {
 
 namespace mlir::triton::gpu {
 
+constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
+constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
+constexpr static char AttrTargetName[] = "ttg.target";
+constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";
+
+int lookupNumWarps(Operation *op);
+
 class LinearLayoutCache {
 public:
   std::optional<LinearLayout> get(const CacheKey &key) {
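The attribute-name constants relocate here from TritonToTritonGPUPass.h (deleted above), and the new free function `lookupNumWarps` supersedes the module-level `getNumWarps` helper removed from TritonGPUDialect.td below. A minimal sketch of what such a lookup could do, assuming it walks outward from `op`; the actual definition lives in the dialect's .cpp and may differ:

// Hedged sketch, not the committed implementation: resolve ttg.num-warps by
// walking up the parent chain; a well-formed TritonGPU program carries the
// attribute at module level, so the walk terminates there at the latest.
int lookupNumWarps(Operation *op) {
  while (op) {
    if (auto attr = op->getAttrOfType<IntegerAttr>(AttrNumWarpsName))
      return attr.getInt();
    op = op->getParentOp();
  }
  llvm::report_fatal_error(
      "TritonGPU module should contain a ttg.num-warps attribute");
}

Taking an `Operation *` rather than a `ModuleOp` lets callers resolve num-warps from any op, which matches the removal of the explicit `int numWarps` parameter from `populateFuncOpConversionPattern` above.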

include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td

Lines changed: 3 additions & 22 deletions

@@ -20,32 +20,13 @@ def TritonGPU_Dialect : Dialect {
   ];
 
   let extraClassDeclaration = [{
-    static std::string getNumWarpsAttrName() { return "ttg.num-warps"; }
-    static int getNumWarps(ModuleOp mod) {
-      if (!mod->hasAttr("ttg.num-warps"))
-        llvm::report_fatal_error(
-            "TritonGPU module should contain a ttg.num-warps attribute");
-      return cast<IntegerAttr>(mod->getAttr("ttg.num-warps")).getInt();
-    }
-    static int getNumCTAs(ModuleOp mod) {
-      if (!mod->hasAttr("ttg.num-ctas"))
-        return 1;
-      return cast<IntegerAttr>(mod->getAttr("ttg.num-ctas")).getInt();
-    }
     void registerTypes();
 
-    static std::string getThreadsPerWarpAttrName() { return "ttg.threads-per-warp"; }
-
-    static int getThreadsPerWarp(ModuleOp mod) {
-      Attribute threadsPerWarp = mod->getDiscardableAttr("ttg.threads-per-warp");
-      if (!threadsPerWarp) {
-        return 32;
-      }
-      return cast<IntegerAttr>(threadsPerWarp).getInt();
-    }
-
     LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
 
+    static int getNumCTAs(ModuleOp mod);
+    static int getThreadsPerWarp(ModuleOp mod);
+
   private:
     LinearLayoutCache llCache;
   }];
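The inline bodies deleted above presumably move into the dialect's .cpp file behind the new out-of-line declarations. Reconstructed from the removed code, the definitions would look roughly like this; the exact destination file, and the use of the new attribute-name constants instead of string literals, are assumptions:

// Sketch reconstructed from the inline bodies removed above; the defaults
// (1 CTA, 32 threads per warp) are preserved from the original code.
int TritonGPUDialect::getNumCTAs(ModuleOp mod) {
  if (!mod->hasAttr(AttrNumCTAsName))
    return 1;
  return cast<IntegerAttr>(mod->getAttr(AttrNumCTAsName)).getInt();
}

int TritonGPUDialect::getThreadsPerWarp(ModuleOp mod) {
  Attribute threadsPerWarp = mod->getDiscardableAttr(AttrNumThreadsPerWarp);
  if (!threadsPerWarp)
    return 32;
  return cast<IntegerAttr>(threadsPerWarp).getInt();
}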

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 2 additions & 19 deletions

@@ -106,27 +106,10 @@ class CoarseSchedule {
     return true;
   }
 
-  void insertMinimum(Operation *op, int stage, Cluster cluster) {
-    auto res = opToStageAndCluster.insert({op, {stage, cluster}});
-    if (res.second) {
-      return;
-    }
-    auto &[existingStage, existingCluster] = res.first->second;
-    existingStage = std::min(stage, existingStage);
-
-    // If existingCluster is reachable from cluster,
-    // then cluster is earlier in the list
-    auto it = cluster;
-    for (auto it = cluster; it != clusters.end(); ++it) {
-      if (it == existingCluster) {
-        existingCluster = cluster;
-        return;
-      }
-    }
-  }
+  bool insertMinimum(Operation *op, int stage, Cluster cluster);
 
   bool insertDepsOfOp(Operation *op, int stage, CoarseSchedule::Cluster cluster,
-                      bool includeArg);
+                      bool includeArg, bool insertIfEarlier = false);
 
   void erase(Operation *op) { opToStageAndCluster.erase(op); }
include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 3 additions & 7 deletions

@@ -42,9 +42,7 @@
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonNvidiaGPU/IR/Ops.h.inc"
 
-namespace mlir {
-namespace triton {
-namespace nvidia_gpu {
+namespace mlir::triton::nvidia_gpu {
 
 struct TensorMemory : public SideEffects::Resource::Base<TensorMemory> {
   StringRef getName() final { return "<TensorMemory>"; }
@@ -63,12 +61,10 @@ Attribute getTmemCompatibleLayout(unsigned M, unsigned N,
                                   ArrayRef<int64_t> shape, unsigned numWarps,
                                   triton::gpu::CTALayoutAttr ctaLayout);
 
-bool isDistributedLayoutTMemCompatible(ModuleOp mod,
+bool isDistributedLayoutTMemCompatible(Operation *op,
                                        RankedTensorType tensorType,
                                        gpu::MemDescType memType);
 
-} // namespace nvidia_gpu
-} // namespace triton
-} // namespace mlir
+} // namespace mlir::triton::nvidia_gpu
 
 #endif // TRITON_DIALECT_TRITONNVIDIAGPU_IR_DIALECT_H_
