import triton
import triton.language as tl

-from triton._internal_testing import is_cuda, is_ampere_or_newer, is_hip_cdna3, is_hip_cdna4, is_hopper_or_newer, is_hopper
+from triton._internal_testing import is_ampere_or_newer, is_hip_cdna3, is_hip_cdna4, is_hopper_or_newer, is_hopper
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.nvidia.ampere import async_copy, mbarrier
from triton.experimental.gluon.language.amd.cdna4 import async_copy as cdna4_async_copy
from triton.experimental.gluon.language.extra import libdevice

+THREADS_PER_WARP = triton.runtime.driver.active.get_current_target().warp_size
+

@gluon.jit
def copy_kernel(Out, In, numel, XBLOCK: ttgl.constexpr, layout: ttgl.constexpr):
@@ -24,18 +26,15 @@ def copy_kernel(Out, In, numel, XBLOCK: ttgl.constexpr, layout: ttgl.constexpr):
    ttgl.store(Out + xoffset, data, xmask)


-copy_kernel_tpw = [32] if is_cuda() else [64]
-
-
@pytest.mark.parametrize("layout", [
-    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[8], order=[0]),
])
@pytest.mark.parametrize("XBLOCK", [128, 256, 512, 1024, 2048])
def test_copy_kernel(layout, XBLOCK):
@@ -403,13 +402,12 @@ def fast_expf_kernel(x_ptr, y_ptr, warp_size: ttgl.constexpr, num_warps: ttgl.co
        y = libdevice.fast_expf(x)
        ttgl.store(y_ptr + offs, y)

-    warp_size = 32 if is_cuda() else 64
    num_warps = 4

    torch.manual_seed(0)
-    x = torch.randn(warp_size * num_warps, device="cuda", dtype=torch.float32)
+    x = torch.randn(THREADS_PER_WARP * num_warps, device="cuda", dtype=torch.float32)
    y = torch.empty_like(x)
-    fast_expf_kernel[(1, )](x, y, warp_size, num_warps)
+    fast_expf_kernel[(1, )](x, y, THREADS_PER_WARP, num_warps)
    torch.testing.assert_close(y, torch.exp(x), atol=1e-5, rtol=1e-4)


@@ -425,13 +423,12 @@ def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warp
        z = libdevice.fast_dividef(x, y)
        ttgl.store(z_ptr + offs, z)

-    warp_size = 32 if is_cuda() else 64
    num_warps = 4

    torch.manual_seed(0)
-    x = torch.randn(warp_size * num_warps, device="cuda", dtype=torch.float32)
+    x = torch.randn(THREADS_PER_WARP * num_warps, device="cuda", dtype=torch.float32)
    y = torch.randn_like(x)
    z = torch.empty_like(x)
    y[y == 0] = 1.0
-    fast_dividef_kernel[(1, )](x, y, z, warp_size, num_warps)
+    fast_dividef_kernel[(1, )](x, y, z, THREADS_PER_WARP, num_warps)
    torch.testing.assert_close(z, torch.div(x, y), atol=1e-5, rtol=1e-4)