
Commit 3320f47

Merge branch 'main' into tkuczynski/enable_test_small_batch_matmul
2 parents: 12510aa + ceefb53

File tree

22 files changed: +1173 -113 lines changed


.github/workflows/build-test-reusable.yml

Lines changed: 6 additions & 0 deletions
@@ -196,6 +196,7 @@ jobs:
         suite:
           - minicore
           - scaled_dot
+          - gluon
           - rest
           - tutorial-fa-64
           - tutorial-fa-128-fwdfp8
@@ -306,6 +307,11 @@ jobs:
         run: |
           ${{ env.TRITON_TEST_CMD }} --scaled-dot

+      - name: Run gluon tests
+        if: matrix.suite == 'gluon' && inputs.driver_version == 'rolling'
+        run: |
+          ${{ env.TRITON_TEST_CMD }} --gluon
+
       - name: Run interpreter tests
         if: matrix.suite == 'rest'
         run: |

.github/workflows/build-test-windows.yml

Lines changed: 7 additions & 0 deletions
@@ -148,6 +148,13 @@ jobs:
           cd ${{ env.NEW_WORKSPACE }}
           ${{ env.TRITON_TEST_CMD }} --core

+      - name: Run gluon tests
+        run: |
+          .venv\Scripts\activate.ps1
+          Invoke-BatchFile "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+          cd ${{ env.NEW_WORKSPACE }}
+          ${{ env.TRITON_TEST_CMD }} --gluon
+
       - name: Run triton kernels tests
         run: |
           .venv\Scripts\activate.ps1

benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py

Lines changed: 15 additions & 17 deletions
@@ -213,17 +213,18 @@ def _attn_bwd_dkdv(dk, dv, #
                    # Filled in by the wrapper.
                    start_n, start_m, num_steps, #
                    MASK: tl.constexpr):
-    offs_m = start_m + tl.arange(0, BLOCK_M1)
     offs_n = start_n + tl.arange(0, BLOCK_N1)
-    offs_k = tl.arange(0, HEAD_DIM)
-    qT_ptrs = Q + offs_m[None, :] * stride_tok + offs_k[:, None] * stride_d
-    do_ptrs = DO + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d
+    qT_desc = tl.make_tensor_descriptor(Q, shape=[HEAD_DIM, N_CTX], strides=[stride_d, stride_tok],
+                                        block_shape=[HEAD_DIM, BLOCK_M1])
+
+    do_desc = tl.make_tensor_descriptor(DO, shape=[N_CTX, HEAD_DIM], strides=[stride_tok, stride_d],
+                                        block_shape=[BLOCK_M1, HEAD_DIM])
     # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
     tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
     curr_m = start_m
     step_m = BLOCK_M1
     for blk_idx in range(num_steps):
-        qT = tl.load(qT_ptrs)
+        qT = qT_desc.load([0, start_m + blk_idx * step_m])
         # Load m before computing qk to reduce pipeline stall.
         offs_m = curr_m + tl.arange(0, BLOCK_M1)
         m = tl.load(M + offs_m)
@@ -233,7 +234,7 @@ def _attn_bwd_dkdv(dk, dv, #
         if MASK:
             mask = (offs_m[None, :] >= offs_n[:, None])
             pT = tl.where(mask, pT, 0.0)
-        do = tl.load(do_ptrs)
+        do = do_desc.load([start_m + blk_idx * step_m, 0])
         # Compute dV.
         ppT = pT
         ppT = ppT.to(tl.float16)
@@ -247,8 +248,6 @@ def _attn_bwd_dkdv(dk, dv, #
         dk += tl.dot(dsT, tl.trans(qT))
         # Increment pointers.
         curr_m += step_m
-        qT_ptrs += step_m * stride_tok
-        do_ptrs += step_m * stride_tok
     return dk, dv


@@ -267,19 +266,20 @@ def _attn_bwd_dq(dq, q, K, V, #
                  start_m, start_n, num_steps, #
                  MASK: tl.constexpr):
     offs_m = start_m + tl.arange(0, BLOCK_M2)
-    offs_n = start_n + tl.arange(0, BLOCK_N2)
-    offs_k = tl.arange(0, HEAD_DIM)
-    kT_ptrs = K + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d
-    vT_ptrs = V + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d
+    kT_desc = tl.make_tensor_descriptor(K, shape=[HEAD_DIM, N_CTX], strides=[stride_d, stride_tok],
+                                        block_shape=[HEAD_DIM, BLOCK_N2])
+
+    vT_desc = tl.make_tensor_descriptor(V, shape=[HEAD_DIM, N_CTX], strides=[stride_d, stride_tok],
+                                        block_shape=[HEAD_DIM, BLOCK_N2])
     # D (= delta) is pre-divided by ds_scale.
     Di = tl.load(D + offs_m)
     # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
     tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
     curr_n = start_n
     step_n = BLOCK_N2
     for blk_idx in range(num_steps):
-        kT = tl.load(kT_ptrs)
-        vT = tl.load(vT_ptrs)
+        kT = kT_desc.load([0, start_n + blk_idx * step_n])
+        vT = vT_desc.load([0, start_n + blk_idx * step_n])
         qk = tl.dot(q, kT)
         p = tl.math.exp2(qk - m)
         # Autoregressive masking.
@@ -296,8 +296,6 @@ def _attn_bwd_dq(dq, q, K, V, #
         dq += tl.dot(ds, tl.trans(kT))
         # Increment pointers.
         curr_n += step_n
-        kT_ptrs += step_n * stride_tok
-        vT_ptrs += step_n * stride_tok
     return dq


@@ -508,7 +506,7 @@ def backward(ctx, do):
         dv = torch.empty_like(v)
         BATCH, N_HEAD, N_CTX = q.shape[:3]
         PRE_BLOCK = 128
-        NUM_WARPS, NUM_STAGES = 4, 5
+        NUM_WARPS, NUM_STAGES = 16, 3
         BLOCK_M1, BLOCK_N1, BLOCK_M2, BLOCK_N2 = 32, 128, 128, 32
         BLK_SLICE_FACTOR = 2
         RCP_LN2 = 1.4426950408889634  # = 1.0 / ln(2)
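Side note (not part of the commit): the flash_attention_benchmark.py change replaces hand-built pointer grids (base + offs[:, None] * stride_tok + offs[None, :] * stride_d, advanced manually each loop iteration) with block descriptors built once via tl.make_tensor_descriptor and indexed per iteration with desc.load. The sketch below illustrates that pattern in isolation; the kernel name, shapes, and parameter names are hypothetical, it assumes a recent Triton with the tensor-descriptor API and a contiguous innermost dimension, and some backends additionally require a scratch allocator (e.g. triton.set_allocator) for device-side descriptors.

# Illustrative sketch only -- not from the commit. Names (copy_rows, BLOCK_M,
# BLOCK_K) are made up; assumes stride_k == 1 (contiguous last dimension).
import triton
import triton.language as tl


@triton.jit
def copy_rows(Src, Dst, M, K, stride_m, stride_k,
              BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr):
    pid = tl.program_id(0)

    # Old pattern (what the diff removes): explicit pointer grid, advanced by hand.
    #   offs_m = pid * BLOCK_M + tl.arange(0, BLOCK_M)
    #   offs_k = tl.arange(0, BLOCK_K)
    #   ptrs = Src + offs_m[:, None] * stride_m + offs_k[None, :] * stride_k
    #   tile = tl.load(ptrs)

    # New pattern (what the diff adds): describe the whole tensor once, then
    # address blocks by their element offsets.
    src_desc = tl.make_tensor_descriptor(Src, shape=[M, K], strides=[stride_m, stride_k],
                                         block_shape=[BLOCK_M, BLOCK_K])
    dst_desc = tl.make_tensor_descriptor(Dst, shape=[M, K], strides=[stride_m, stride_k],
                                         block_shape=[BLOCK_M, BLOCK_K])
    tile = src_desc.load([pid * BLOCK_M, 0])
    dst_desc.store([pid * BLOCK_M, 0], tile)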

python/test/gluon/test_consan.py

Lines changed: 9 additions & 9 deletions
@@ -85,7 +85,7 @@ def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr
     tma.store_wait(0)


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer")
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_async_tma_kernel(FAILURE, device, run_wrapper):
     if run_wrapper:
@@ -141,7 +141,7 @@ def tma_interleave_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.cons
     tma.store_wait(0)


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer")
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_tma_interleave_kernel(FAILURE, device, run_wrapper):
     if run_wrapper:
@@ -190,7 +190,7 @@ def async_copy_kernel(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr):
     ampere.async_copy.wait_group(0)


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires ampere or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires ampere or newer")
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_async_copy(FAILURE, device, run_wrapper):
     if run_wrapper:
@@ -252,7 +252,7 @@ def tcgen5_mma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexp
     mbarrier.invalidate(bar.index(1))


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
 @pytest.mark.parametrize("FAILURE", [True, False])
 @pytest.mark.parametrize("MEM_ACCESS_KIND", ["tma_cp", "local_store", "tmem_load", "tmem_store"])
 def test_tcgen5_mma(FAILURE, MEM_ACCESS_KIND, device, run_wrapper):
@@ -305,7 +305,7 @@ def warpgroup_mma_kernel(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr)
     smemA.store(ttgl.full([XBLOCK, XBLOCK], 42, ttgl.float16, blocked_layout))


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_warpgroup_mma(FAILURE, device, run_wrapper):
     if run_wrapper:
@@ -353,7 +353,7 @@ def warpgroup_mma_kernel2(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr
     smemA.store(ttgl.full([XBLOCK, XBLOCK], 42, ttgl.float16, blocked_layout))


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_warpgroup_mma2(FAILURE, device, run_wrapper):
     if run_wrapper:
@@ -406,7 +406,7 @@ def tcgen5_mma_multibar_kernel(input_desc, XBLOCK: ttgl.constexpr, BUF_IDX: ttgl
     mbarrier.invalidate(bar.index(i))


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
 @pytest.mark.parametrize("BUF_IDX", [0, 1])
 @pytest.mark.parametrize("BAR_IDX", [0, 1, 2, 3])
 def test_tcgen5_mma_multibar(BUF_IDX, BAR_IDX, device, run_wrapper):
@@ -529,7 +529,7 @@ def multibuffered_loop_tma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: t
     mbarrier.invalidate(barMMA.index(i))


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_multibuffered_loop(FAILURE, device, run_wrapper):
     if run_wrapper:
@@ -611,7 +611,7 @@ def multibuffered_loop_wgmma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE:
     mbarrier.invalidate(barLoadB.index(i))


-@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_multibuffered_wgmma_loop(FAILURE, device, run_wrapper):
     if run_wrapper: