
Commit bed2f70

Merge branch 'main' into feature/benchmarks-subset
2 parents d10efe0 + 98dca47 commit bed2f70

211 files changed: +10689 -3789 lines changed


.pre-commit-config.yaml
Lines changed: 6 additions & 8 deletions

@@ -1,6 +1,7 @@
+default_stages: [pre-commit, pre-push, manual]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v5.0.0
     hooks:
       - id: check-symlinks
       - id: destroyed-symlinks
@@ -17,12 +18,11 @@ repos:
       - id: debug-statements

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.3
+    rev: v0.7.1
     hooks:
       - id: ruff
         files: '^python/.*'
-        args: ["--fix", "--line-length", "120"]
-        stages: [pre-commit, pre-push, manual]
+        args: ["--fix", "--exit-non-zero-on-fix"]
         exclude: |
           (?x)(
             ^python/triton/runtime/.*|
@@ -31,18 +31,16 @@ repos:
           )

   - repo: https://github.com/google/yapf
-    rev: be72557
+    rev: "7e21823"
     hooks:
       - id: yapf
         args: ["-p", "-i"]
-        stages: [pre-commit, pre-push, manual]
         exclude: "python/test/unit/language/test_line_info.py"

   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v16.0.6
+    rev: v19.1.2
     hooks:
       - id: clang-format
-        stages: [pre-commit, pre-push, manual]

   # Expand YAML anchors in files used by github workflows, because github can't
   # do this itself. This lets us use anchors, which avoids code duplication.

CMakeLists.txt
Lines changed: 18 additions & 1 deletion

@@ -12,7 +12,7 @@ set(CMAKE_CXX_STANDARD 17)

 set(CMAKE_INCLUDE_CURRENT_DIR ON)

-project(triton)
+project(triton CXX)
 include(CTest)

 if(NOT WIN32)
@@ -26,8 +26,25 @@ option(TRITON_BUILD_TUTORIALS "Build C++ Triton tutorials" ON)
 option(TRITON_BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
 option(TRITON_BUILD_PROTON "Build the Triton Proton profiler" ON)
 option(TRITON_BUILD_UT "Build C++ Triton Unit Tests" ON)
+option(TRITON_BUILD_WITH_CCACHE "Build with ccache (if available)" ON)
 set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")

+if(TRITON_BUILD_WITH_CCACHE)
+  find_program(CCACHE_PROGRAM ccache)
+  if(CCACHE_PROGRAM)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}"
+        CACHE STRING "C compiler launcher")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}"
+        CACHE STRING "CXX compiler launcher")
+  else()
+    message(
+      STATUS
+        "Could not find ccache. Consider installing ccache to speed up compilation."
+    )
+  endif()
+endif()
+
+
 # Ensure Python3 vars are set correctly
 # used conditionally in this file and by lit tests
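A note on the new build option: TRITON_BUILD_WITH_CCACHE defaults to ON but only sets the compiler launchers when a ccache binary is actually found, so configurations without ccache behave as before apart from the status message. It can also be disabled explicitly at configure time with the standard -DTRITON_BUILD_WITH_CCACHE=OFF cache flag.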

benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py
Lines changed: 3 additions & 2 deletions

@@ -78,10 +78,11 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, #
     start_m = tl.program_id(2)
     off_z = tl.program_id(0)
     off_h = tl.program_id(1)
+    qvk_offset = off_z.to(tl.int64) * stride_qz + off_h.to(tl.int64) * stride_qh
     if N_CTX <= 512:
         start_m = tl.program_id(0)
         off_z = tl.program_id(2)
-        qvk_offset = off_z.to(tl.int64) * stride_qz + off_h.to(tl.int64) * stride_qh
+        qvk_offset = off_z.to(tl.int64) * stride_qh

     # block pointers
     Q_block_ptr = tl.make_block_ptr(
@@ -181,7 +182,7 @@ def forward(q, k, v, causal, sm_scale):
     grid = lambda args: (q.shape[0], q.shape[1], triton.cdiv(q.shape[2], args['BLOCK_M']))
     n_ctx = q.shape[2]
     if n_ctx <= 512:
-        grid = lambda args: (triton.cdiv(q.shape[2], args['BLOCK_M']), q.shape[1], q.shape[0])
+        grid = lambda args: (triton.cdiv(q.shape[2], args['BLOCK_M']), 1, q.shape[0] * q.shape[1])
     M = torch.empty((q.shape[0], q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)

     if os.getenv('TRITON_INTEL_ADVANCED_PATH', '0') == '0':
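To see why the short-context kernel can drop the separate off_h term, here is a minimal hedged sketch of the new launch mapping; the helper name is illustrative and it assumes contiguous (Z, H, N_CTX, D) inputs so that stride_qz == H * stride_qh.

# Hedged sketch of the N_CTX <= 512 launch used above; not part of the benchmark.
import triton

def short_ctx_grid(Z, H, N_CTX, BLOCK_M):
    # axis 0: query block, axis 1: unused, axis 2: flattened batch*head index
    return (triton.cdiv(N_CTX, BLOCK_M), 1, Z * H)

# Inside the kernel, program_id(2) == z * H + h on this path, hence
#   program_id(2) * stride_qh == z * (H * stride_qh) + h * stride_qh
#                             == z * stride_qz       + h * stride_qh,
# which matches the offset the long-context path computes from off_z and off_h.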

benchmarks/triton_kernels_benchmark/gemm_benchmark.py
Lines changed: 3 additions & 3 deletions

@@ -129,8 +129,8 @@ def matmul_kernel_with_block_pointers_batched(
         stride_cz: tl.constexpr, stride_cm: tl.constexpr, stride_cn: tl.constexpr,
         # Meta-parameters
         BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
-    bid = tl.program_id(axis=0)
-    pid = tl.program_id(axis=1)
+    bid = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
     num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
     num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
     num_pid_in_group = GROUP_SIZE_M * num_pid_n
@@ -186,8 +186,8 @@ def matmul(a, b, c, transpose_a=False, transpose_b=False):
     B = a.shape[0]
     # 1D launch kernel where each block gets its own program.
     grid = lambda META: (
-        B,
         triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
+        B,
     )
     matmul_kernel_with_block_pointers_batched[grid](
         a, b, c, #
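The two hunks above have to move together, since the grid tuple and the tl.program_id axes are positional. A minimal hedged sketch of the new ordering (the helper name is illustrative, not part of the benchmark):

import triton

def batched_gemm_grid(B, M, N, META):
    # axis 0: tile id within one matrix (read as pid in the kernel)
    # axis 1: batch id (read as bid in the kernel)
    tiles = triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N'])
    return (tiles, B)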

bin/RegisterTritonDialects.h
Lines changed: 2 additions & 0 deletions

@@ -88,6 +88,8 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUStreamPipelineV2();
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();
+  mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
+  mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();

   // TODO: register Triton & TritonGPU passes
   registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,

cmake/llvm-hash.txt
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-b5cc222d7429fe6f18c787f633d5262fac2e676f
+fa57c7a6a5f594a9e3ae2dbe3542cf89a20cdd73

docs/conf.py
Lines changed: 1 addition & 1 deletion

@@ -145,7 +145,7 @@ def documenter(app, obj, parent):
 autosummary_generate = True

 # versioning config
-smv_tag_whitelist = r'^(v3.1.0)$'
+smv_tag_whitelist = r'^(v3.2.0)$'
 smv_branch_whitelist = r'^main$'
 smv_remote_whitelist = None
 smv_released_pattern = r'^tags/.*$'

docs/python-api/triton-semantics.rst
Lines changed: 1 addition & 3 deletions

@@ -14,9 +14,7 @@ The algorithm is as follows:

 2. **Width** If both tensors are of dtypes of the same kind, and one of them is of a higher width, the other one is promoted to this dtype: ``(float32, float16) -> float32``

-3. **Supremum** If both tensors are of the same width and signedness but different dtypes, they are both promoted to the next larger dtype. ``(float16, bfloat16) -> float32``
-
-3.1 If both tensors are of different ``fp8`` dtypes, they are both cast to ``float16``.
+3. **Prefer float16** If both tensors are of the same width and signedness but different dtypes (``float16`` and ``bfloat16`` or different ``fp8`` types), they are both promoted to ``float16``. ``(float16, bfloat16) -> float16``

 4. **Prefer unsigned** Otherwise (same width, different signedness), they are promoted to the unsigned dtype: ``(int32, uint32) -> uint32``
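To make the revised rule 3 concrete, here is a minimal hedged sketch of a kernel exercising it; it relies on the standard tl.arange / tl.full / tl.static_assert builtins, and the kernel and pointer names are illustrative only.

import triton
import triton.language as tl

@triton.jit
def promotion_rule3(out_ptr):
    offs = tl.arange(0, 16)
    a = tl.full((16,), 1.0, dtype=tl.float16)
    b = tl.full((16,), 2.0, dtype=tl.bfloat16)
    c = a + b                                # rule 3: (float16, bfloat16) -> float16
    tl.static_assert(c.dtype == tl.float16)  # expected to hold under the new rule
    tl.store(out_ptr + offs, c)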

docs/python-api/triton.language.rst
Lines changed: 1 addition & 0 deletions

@@ -59,6 +59,7 @@ Linear Algebra Ops
     :nosignatures:

     dot
+    dot_scaled


 Memory/Pointer Ops

include/triton/Analysis/Allocation.h
Lines changed: 59 additions & 49 deletions

@@ -93,6 +93,45 @@ class Allocation {
   using BufferIdSetT = DenseSet<BufferId>;
   using FuncAllocMapT = CallGraph<Allocation>::FuncDataMapT;

+  /// A class that represents a shared memory buffer
+  struct BufferT {
+    /// Explicit: triton_gpu.local_alloc
+    /// Scratch: triton_gpu.convert_layout
+    /// Virtual: triton.call
+    enum class BufferKind { Explicit, Scratch, Virtual };
+
+    /// MT: thread-safe
+    inline static std::atomic<BufferId> nextId = 0;
+
+    BufferKind kind;
+    BufferId id;
+    size_t size;
+    size_t alignment;
+    size_t offset;
+
+    bool operator==(const BufferT &other) const { return id == other.id; }
+    bool operator<(const BufferT &other) const { return id < other.id; }
+
+    BufferT() : BufferT(BufferKind::Explicit, 0) {}
+    BufferT(BufferKind kind, size_t size, size_t alignment = 4,
+            size_t offset = 0)
+        : kind(kind), id(nextId++), size(size), alignment(alignment),
+          offset(offset) {}
+
+    size_t setOffsetAligned(size_t newOffset) {
+      return offset = llvm::alignTo(newOffset, alignment);
+    }
+  };
+
+  /// Op -> Scratch Buffer
+  using OpScratchMapT = DenseMap<Operation *, BufferT *>;
+  /// Value -> Explicit Buffer
+  using ValueBufferMapT = llvm::MapVector<Value, BufferT *>;
+  /// Value -> Alias Buffer
+  using AliasBufferMapT = llvm::MapVector<Value, llvm::SetVector<BufferT *>>;
+  /// BufferId -> Buffer
+  using BufferSetT = std::map<BufferId, BufferT>;
+
   static constexpr BufferId InvalidBufferId =
       std::numeric_limits<BufferId>::max();

@@ -102,11 +141,17 @@
   explicit Allocation(Operation *operation) : operation(operation) {}

   /// Runs allocation analysis on the given top-level operation.
-  void run(FuncAllocMapT &funcAllocMap);
+  template <typename AllocationAnalysis> void run(FuncAllocMapT &funcAllocMap);

   /// Returns the operation this analysis was constructed from.
   Operation *getOperation() const { return operation; }

+  const OpScratchMapT &getOpScratch() const { return opScratch; }
+  const OpScratchMapT &getOpVirtual() const { return opVirtual; }
+  const ValueBufferMapT &getValueBuffer() const { return valueBuffer; }
+  const AliasBufferMapT &getAliasBuffer() const { return aliasBuffer; }
+  void setSharedMemorySize(size_t size) { sharedMemorySize = size; }
+
   /// Returns the offset of the given buffer in the shared memory.
   size_t getOffset(BufferId bufferId) const {
     return bufferSet.at(bufferId).offset;
@@ -170,47 +215,6 @@
   /// Returns mapping from operation to list of live LDS buffers
   std::map<Operation *, SmallVector<BufferId>> getLiveBuffers();

-private:
-  /// A class that represents a shared memory buffer
-  struct BufferT {
-    /// Explicit: triton_gpu.local_alloc
-    /// Scratch: triton_gpu.convert_layout
-    /// Virtual: triton.call
-    enum class BufferKind { Explicit, Scratch, Virtual };
-
-    /// MT: thread-safe
-    inline static std::atomic<BufferId> nextId = 0;
-
-    BufferKind kind;
-    BufferId id;
-    size_t size;
-    size_t alignment;
-    size_t offset;
-
-    bool operator==(const BufferT &other) const { return id == other.id; }
-    bool operator<(const BufferT &other) const { return id < other.id; }
-
-    BufferT() : BufferT(BufferKind::Explicit, 0) {}
-    BufferT(BufferKind kind, size_t size, size_t alignment = 4,
-            size_t offset = 0)
-        : kind(kind), id(nextId++), size(size), alignment(alignment),
-          offset(offset) {}
-
-    size_t setOffsetAligned(size_t newOffset) {
-      return offset = llvm::alignTo(newOffset, alignment);
-    }
-  };
-
-  /// Op -> Scratch Buffer
-  using OpScratchMapT = DenseMap<Operation *, BufferT *>;
-  /// Value -> Explicit Buffer
-  using ValueBufferMapT = llvm::MapVector<Value, BufferT *>;
-  /// Value -> Alias Buffer
-  using AliasBufferMapT = llvm::MapVector<Value, llvm::SetVector<BufferT *>>;
-  /// BufferId -> Buffer
-  using BufferSetT = std::map<BufferId, BufferT>;
-
-private:
   template <BufferT::BufferKind Kind, typename KeyType, typename... Args>
   void addBuffer(KeyType &key, Args &&...args) {
     auto buffer = BufferT(Kind, std::forward<Args>(args)...);
@@ -236,10 +240,11 @@
   AliasBufferMapT aliasBuffer;
   BufferSetT bufferSet;
   size_t sharedMemorySize = 0;
-
-  friend class triton::AllocationAnalysis;
 };

+template <>
+void Allocation::run<triton::AllocationAnalysis>(FuncAllocMapT &funcAllocMap);
+
 /// Static analysis that computes the allocation of shared memory buffers
 /// of the entire call graph.
 /// The allocation is performed in a post-order walk of the call graph.
@@ -250,17 +255,19 @@ class ModuleAllocation : public CallGraph<Allocation> {
 public:
   using FuncOffsetMapT = DenseMap<FunctionOpInterface, Value>;

-  explicit ModuleAllocation(ModuleOp moduleOp)
-      : CallGraph<Allocation>(moduleOp) {
-    walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
+  template <typename AllocationAnalysis = triton::AllocationAnalysis>
+  static ModuleAllocation get(ModuleOp moduleOp) {
+    ModuleAllocation res(moduleOp);
+    res.walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
         // Pre-order edge walk callback
         [](CallOpInterface callOp, FunctionOpInterface funcOp) {},
         // Post-order node walk callback
         [&](FunctionOpInterface funcOp) {
-          auto [iter, inserted] = funcMap.try_emplace(funcOp, funcOp);
+          auto [iter, inserted] = res.funcMap.try_emplace(funcOp, funcOp);
           if (inserted)
-            iter->second.run(funcMap);
+            iter->second.template run<AllocationAnalysis>(res.funcMap);
         });
+    return res;
   }

   size_t getSharedMemorySize() {
@@ -285,6 +292,9 @@ class ModuleAllocation : public CallGraph<Allocation> {
   }

 private:
+  explicit ModuleAllocation(ModuleOp moduleOp)
+      : CallGraph<Allocation>(moduleOp) {}
+
   FuncOffsetMapT sharedMemoryValue;
 };
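In practical terms, this refactor changes how the analysis is driven: BufferT and its map typedefs become public, the per-function analysis is now selected through the run<AllocationAnalysis>() template (with the triton::AllocationAnalysis specialization declared above), and a ModuleAllocation is obtained via the static ModuleAllocation::get(moduleOp) factory, or get<SomeCustomAllocationAnalysis>(moduleOp) for a backend-supplied analysis, since the constructor is now private. SomeCustomAllocationAnalysis is a hypothetical name used only for illustration.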
