@@ -154,6 +154,19 @@ def _attn_fwd_with_block_pointers(Q, K, V, sm_scale, M, Out, #
     # epilogue
     m_i += tl.math.log2(l_i)
     acc = acc / l_i[:, None]
+    if N_CTX <= 512:
+        off_hz = off_z + off_h * H
+    else:
+        off_hz = off_z * H + off_h
+    M_block_ptr = tl.make_block_ptr(
+        base=M + off_hz * N_CTX,
+        shape=[N_CTX],
+        strides=[1],
+        offsets=[start_m * BLOCK_M],
+        block_shape=[BLOCK_M],
+        order=[0],
+    )
+    tl.store(M_block_ptr, m_i)
     tl.store(O_block_ptr, acc.to(Out.type.element_ty), boundary_check=(0, 1))
 
 
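Note: the added epilogue lines store the per-row softmax statistics m_i (the running max folded together with log2(l_i), i.e. the log-sum-exp kept in the log2 domain) into M, so the backward kernels can rebuild the attention probabilities instead of re-reducing over the keys. A minimal PyTorch sketch of that reconstruction, assuming M holds exactly the value written above; rebuild_probs and its arguments are illustrative names, not part of this file:

import torch

def rebuild_probs(qk_log2_scaled: torch.Tensor, m: torch.Tensor) -> torch.Tensor:
    # qk_log2_scaled: [rows, keys] scores already scaled into the log2 domain,
    # m: [rows] per-row statistics as stored into M by the kernel epilogue.
    return torch.exp2(qk_log2_scaled - m[:, None])  # each row sums to 1 by construction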
@@ -220,7 +233,7 @@ def _attn_bwd_dkdv(dk, dv, #
         if MASK:
             mask = (offs_m[None, :] >= offs_n[:, None])
             pT = tl.where(mask, pT, 0.0)
-        do = tl.load(do_ptrs).to(tl.float16)
+        do = tl.load(do_ptrs)
         # Compute dV.
         ppT = pT
         ppT = ppT.to(tl.float16)
@@ -275,7 +288,7 @@ def _attn_bwd_dq(dq, q, K, V, #
             mask = (offs_m[:, None] >= offs_n[None, :])
             p = tl.where(mask, p, 0.0)
         # Compute dP and dS.
-        dp = tl.dot(do.to(tl.float16), vT).to(tl.float32)
+        dp = tl.dot(do, vT).to(tl.float32)
         ds = p * (dp - Di[:, None])
         ds = ds.to(tl.float16)
         # Compute dQ.
@@ -423,12 +436,12 @@ class _attention(torch.autograd.Function):
     attn_fwd: Callable = None
 
     @staticmethod
-    def forward(ctx, q, k, v, causal, sm_scale, dq, dk, dv, delta):
+    def forward(ctx, q, k, v, causal, sm_scale):
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
         assert Lq == Lk and Lk == Lv
         assert Lk in {16, 32, 64, 128}
-        o = torch.empty_like(q, dtype=torch.float32)
+        o = torch.empty_like(q)
         BLOCK_M = 128
         BLOCK_N = 64
         num_stages = 3
@@ -473,8 +486,7 @@ def forward(ctx, q, k, v, causal, sm_scale, dq, dk, dv, delta):
             advanced_path=True,  #
         )
 
-        ctx.save_for_backward(q, k, v, o, M, dq, dk, dv, delta)
-        ctx.grid = grid
+        ctx.save_for_backward(q, k, v, o, M)
         ctx.sm_scale = sm_scale
         ctx.HEAD_DIM = Lk
         ctx.causal = causal
@@ -488,9 +500,12 @@ def backward(ctx, do):
         with record_function(
                 '__profile_kernel_of_func_bwd_fa'
         ) if benchmark_suite.BENCHMARKING_METHOD == 'UPSTREAM_PYTORCH_PROFILER' else contextlib.nullcontext():
-            q, k, v, o, M, dq, dk, dv, delta = ctx.saved_tensors
+            q, k, v, o, M = ctx.saved_tensors
             assert do.is_contiguous()
             assert q.stride() == k.stride() == v.stride() == o.stride() == do.stride()
+            dq = torch.empty_like(q)
+            dk = torch.empty_like(k)
+            dv = torch.empty_like(v)
             BATCH, N_HEAD, N_CTX = q.shape[:3]
             PRE_BLOCK = 128
             NUM_WARPS, NUM_STAGES = 4, 5
@@ -502,6 +517,7 @@ def backward(ctx, do):
             PRE_BLOCK = 128
             assert N_CTX % PRE_BLOCK == 0
             pre_grid = (N_CTX // PRE_BLOCK, BATCH * N_HEAD)
+            delta = torch.empty_like(M)
             _attn_bwd_preprocess[pre_grid](
                 o, do,  #
                 delta,  #
@@ -522,7 +538,7 @@ def backward(ctx, do):
                 num_stages=NUM_STAGES  #
             )
 
-        return dq, dk, dv, None, None, None, None, None, None
+        return dq, dk, dv, None, None, None, None
 
 
 attention = _attention.apply
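Note: with the simplified signature, attention now takes only the five user-facing arguments and the gradient buffers are allocated inside backward, so the op is differentiated like any other autograd function. A hedged usage sketch (shapes, dtype and scale are illustrative, not taken from this diff):

import torch

q = torch.randn(1, 16, 1024, 64, device='xpu', dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

o = attention(q, k, v, True, 0.125)  # q, k, v, causal, sm_scale
o.backward(torch.randn_like(o))      # dq, dk, dv come from buffers created in backward()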
@@ -537,6 +553,9 @@ def get_benchmark(
     Returns a Mark object containing a Benchmark object constructed at runtime and parameterized by the provided option values.
     The benchmark can then be executed by calling the :code:`.run` method on the return value.
     """
+    causal_mode = [False, True] if fa_kernel_mode == 'fwd' else [
+        True
+    ]  # The 06 tutorial bwd Non-causal tests do not pass at the moment.
 
     supported_providers = {
         'triton': 'Triton',
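Note: causal_mode narrows the generated configurations so that bwd benchmarks only run with causal=True (see the comment on the added lines above). As the docstring says, the object returned by get_benchmark is executed through its .run method; a sketch of typical usage, where the keyword arguments follow the usual triton.testing report interface and are an assumption, not pinned down by this diff:

# Assumed usage; the .run kwargs are illustrative.
bench = get_benchmark(fa_kernel_mode='bwd')
bench.run(show_plots=False, print_data=True)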
@@ -556,9 +575,9 @@ def get_benchmark(
             x_vals=[[z, h, 16384 // z, dhead, causal, mode]
                     for z in [1, 2, 4, 8, 16, 32]
                     for (h, dhead) in [(16, 128), (32, 64)]
-                    for causal in [False, True]
+                    for causal in causal_mode
                     for mode in [fa_kernel_mode]]  #
-            + [[4, 48, 1024, 64, causal, mode] for causal in [False, True] for mode in [fa_kernel_mode]],
+            + [[4, 48, 1024, 64, causal, mode] for causal in causal_mode for mode in [fa_kernel_mode]],
             line_arg='provider',
             # argument name whose value corresponds to a different line in the plot
             # possible values for `line_arg``
@@ -587,60 +606,44 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, MODE, provider):
         if MODE not in modes:
             raise AssertionError(f'Unknown {MODE}, supported modes are {modes}')
         dtype = torch.float16
+        torch.xpu.empty_cache()
         q = torch.randn((Z, H, N_CTX, D_HEAD), device='xpu', dtype=dtype, requires_grad=True)
         k = torch.randn((Z, H, N_CTX, D_HEAD), device='xpu', dtype=dtype, requires_grad=True)
         v = torch.randn((Z, H, N_CTX, D_HEAD), device='xpu', dtype=dtype, requires_grad=True)
         sm_scale = 0.125
-        dq, dk, dv, delta = None, None, None, None
-        if MODE == 'bwd':
-            sm_scale = 1.3
-            dq = torch.empty_like(q)
-            dk = torch.empty_like(k)
-            dv = torch.empty_like(v)
-            delta = torch.empty_like(q)
         quantiles = [0.5, 0.0, 1.0]
         atol = 1e-1 if N_CTX == 16384 else 1e-2
+        bwd_atol = 1e-1 if N_CTX >= 4096 else 1e-2
         # FIXME: use torch sdpa for result check after https://github.com/intel/intel-xpu-backend-for-triton/issues/2042 fixed
         torch_fn = lambda: torch.nn.functional.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(
-        ), attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32)
-        if MODE == 'bwd':
-            torch_o = torch_fn()
-            torch_do = torch.randn_like(torch_o)
-            torch_fn = lambda: torch_o.backward(torch_do, retain_graph=True)
-
-        if provider == 'onednn':
-            _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
-                torch_fn,
-                n_warmup=n_warmup,
-                n_repeat=10,
-                quantiles=quantiles,
-                time_warmup=False,
-            )
+        ), attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale)
 
-        elif provider == 'triton':
-            triton_fn = lambda: attention(q, k, v, CAUSAL, sm_scale, dq, dk, dv, delta)
-            if MODE == 'bwd':
-                triton_o = triton_fn()
-                triton_do = torch.randn_like(triton_o)
-                triton_fn = lambda: triton_o.backward(triton_do, retain_graph=True)
+        if provider == 'triton':
+            triton_fn = lambda: attention(q, k, v, CAUSAL, sm_scale)
             if MODE == 'fwd':
                 benchmark_suite.assert_close(triton_fn, torch_fn, atol=atol, rtol=1e-3, err_msg='triton to torch')
             else:
-                benchmark_suite.assert_close(
-                    lambda: triton_o,
-                    lambda: torch_o,
-                    atol=1e-2,
-                    rtol=0,
-                    err_msg='triton to torch',
-                )
-
-            _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
-                triton_fn,
-                n_warmup=n_warmup,
-                n_repeat=10,
-                quantiles=quantiles,
-                time_warmup=False,
-            )
+                dout = torch.randn_like(q)
+                torch_o = torch_fn()
+                torch_grads = torch.autograd.grad((torch_o, ), (q, k, v), dout.cpu(), retain_graph=True)
+                eager_tensors = torch_grads
+                triton_o = triton_fn()
+                triton_grads = torch.autograd.grad((triton_o, ), (q, k, v), dout, retain_graph=True)
+                compiled_tensors = triton_grads
+
+                benchmark_suite.assert_close(lambda: torch_o, lambda: triton_o, atol=atol, rtol=1e-3,
+                                             err_msg='Error comparing out between triton and torch')
+
+                tensor_names = ['grad_query', 'grad_key', 'grad_value']
+                for eager, compiled, name in zip(eager_tensors, compiled_tensors, tensor_names):
+                    benchmark_suite.assert_close(lambda eager=eager: eager, lambda compiled=compiled: compiled,
+                                                 atol=bwd_atol, rtol=1e-3,
+                                                 err_msg=f'Error comparing {name} between triton and torch')
+                triton_fn = lambda: triton_o.backward(dout, retain_graph=True)
+
+            _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(triton_fn, n_warmup=n_warmup, n_repeat=10,
+                                                                   quantiles=quantiles, grad_to_none=(q, k, v),
+                                                                   time_warmup=False)
 
         elif provider == 'xetla':
             if MODE == 'bwd':
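Note: the rewritten bwd path above first checks dq/dk/dv via torch.autograd.grad against the CPU SDPA reference, then times only the backward call and passes grad_to_none=(q, k, v) to the benchmark harness. A small sketch of why that flag matters, assuming the harness clears gradients between runs the way triton.testing.do_bench does; the loop count is illustrative:

# Without clearing, .grad accumulates across repeats and the extra accumulation
# (and its memory traffic) would be folded into the measured backward time.
for _ in range(10):
    for t in (q, k, v):
        t.grad = None  # what grad_to_none=(q, k, v) asks the harness to do between runs
    triton_o.backward(dout, retain_graph=True)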