Commit 633d32d

Merge commit '43dbdd1685625ce71daea1caf8a4d90fdea6457f'
2 parents: 5c020ef + 43dbdd1

File tree: 39 files changed (+2005 / -1249 lines)

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 5 additions & 1 deletion
@@ -54,6 +54,7 @@ class CoarseSchedule {
   iterator end() { return orderClusters.end(); }
   const_iterator end() const { return orderClusters.end(); }
   size_t size() const { return orderClusters.size(); }
+  void clear() { orderClusters.clear(); }
   iterator newAtBack() {
     orderClusters.push_back(orderClusters.size());
     return std::prev(orderClusters.end());

@@ -157,7 +158,10 @@ class CoarseSchedule {
   // Set <stage, cluster> based on CoarseSchedule.
   void serialize(scf::ForOp &forOp) const;
   // Create a CoarseSchedule based on forOp's <stage, cluster>.
-  LogicalResult deSerialize(scf::ForOp &forOp);
+  // If normalizeClusterId is true, clusters [minClusterId, maxClusterId] will
+  // be remapped to [0, maxClusterId - minClusterId].
+  // If false, it won't remap and clusters [0, maxClusterId] will be created.
+  LogicalResult deSerialize(scf::ForOp &forOp, bool normalizeClusterId = true);
 
   static ClusterHash hashCluster(Cluster cluster) {
     return reinterpret_cast<ClusterHash>(&*cluster);
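
The remapping described by the new comment on deSerialize can be made concrete with a small sketch. The snippet below is illustrative only and not part of this commit: the helper name remap_cluster_ids is made up, and it models only the cluster-id bookkeeping, not the actual MLIR data structures.

def remap_cluster_ids(cluster_ids, normalize=True):
    # cluster_ids: the cluster values serialized on the loop's ops.
    min_id, max_id = min(cluster_ids), max(cluster_ids)
    if normalize:
        # normalizeClusterId == true: [minClusterId, maxClusterId] is remapped
        # to [0, maxClusterId - minClusterId].
        return {c: c - min_id for c in range(min_id, max_id + 1)}
    # normalizeClusterId == false: clusters [0, maxClusterId] are created and
    # the serialized ids are kept as-is.
    return {c: c for c in range(0, max_id + 1)}

# Example with serialized cluster ids {2, 3, 5}:
#   normalize=True  -> {2: 0, 3: 1, 4: 2, 5: 3}
#   normalize=False -> {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}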

lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp

Lines changed: 11 additions & 3 deletions
@@ -263,7 +263,8 @@ void tt::CoarseSchedule::serialize(scf::ForOp &forOp) const {
 }
 
 // Create a CoarseSchedule based on forOp's <stage, cluster>.
-LogicalResult tt::CoarseSchedule::deSerialize(scf::ForOp &forOp) {
+LogicalResult tt::CoarseSchedule::deSerialize(scf::ForOp &forOp,
+                                              bool normalizeClusterId) {
   auto [minClusterId, maxClusterId] = getMinMaxCluster(forOp);
   std::optional<int> maxStage = tryGetMaxStage(forOp);
   if (!maxStage) {

@@ -272,9 +273,16 @@ LogicalResult tt::CoarseSchedule::deSerialize(scf::ForOp &forOp) {
   numStages = *maxStage + 1;
 
   DenseMap<int, tt::CoarseSchedule::Cluster> clustersMap;
-  for (int i = minClusterId; i < maxClusterId + 1; i++) {
-    clustersMap.insert({i, clusters.newAtBack()});
+  if (normalizeClusterId) {
+    for (int i = minClusterId; i < maxClusterId + 1; i++) {
+      clustersMap.insert({i, clusters.newAtBack()});
+    }
+  } else {
+    for (int i = 0; i < maxClusterId + 1; i++) {
+      clustersMap.insert({i, clusters.newAtBack()});
+    }
   }
+
   for (Operation &op : forOp.getBody()->without_terminator()) {
     if (!op.hasAttr(mlir::triton::kLoopStageAttrName))
       continue;

python/test/gluon/test_consan.py

Lines changed: 65 additions & 36 deletions
@@ -86,40 +86,43 @@ def failing_kernel(input):
     ampere.async_copy.wait_group(0)
 
 
-def run_failing_kernel(device):
+def run_failing_kernel(device, enable_consan, mode):
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)
 
     triton.set_allocator(alloc_fn)
 
+    if enable_consan:
+        if mode == "env":
+            os.environ["TRITON_INSTRUMENTATION_MODE"] = "consan"
+            knobs.refresh_knobs()
+        elif mode == "knob":
+            knobs.compilation.instrumentation_mode = "consan"
+
     input = torch.randn((XBLOCK, XBLOCK), device=device, dtype=torch.float16)
     failing_kernel[(1, )](input)
 
 
 @pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper", run=False)
-def test_cache_miss_knob(device, fresh_knobs, monkeypatch):
+def test_cache_miss_knob(device, monkeypatch):
     # First run without consan
-    knobs.compilation.enable_experimental_consan = False
-    run_failing_kernel(device)
+    run_in_process(run_failing_kernel, (device, False, "knob"))
 
     # Then run with consan and assert that it fails
-    knobs.compilation.enable_experimental_consan = True
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
-    result = run_in_process(run_failing_kernel, (device, ))
+    result = run_in_process(run_failing_kernel, (device, True, "knob"))
     assert "device-side assert" in str(result.exc)
 
 
 @pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper", run=False)
 def test_cache_miss_env(device, monkeypatch):
     # First run without consan
-    knobs.compilation.enable_experimental_consan = False
-    run_failing_kernel(device)
+    run_in_process(run_failing_kernel, (device, False, "env"))
 
     # Then run with consan and assert that it fails
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
-    result = run_in_process(run_failing_kernel, (device, ))
+    result = run_in_process(run_failing_kernel, (device, True, "env"))
     assert "device-side assert" in str(result.exc)
 
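The tests above turn ConSan on in one of two ways: by setting TRITON_INSTRUMENTATION_MODE=consan in the environment and then calling knobs.refresh_knobs() so already-loaded knobs pick up the change, or by assigning knobs.compilation.instrumentation_mode directly. A minimal standalone sketch of that pattern follows; it reuses only calls that appear in the diff above, while the import path and the enable_consan helper are assumptions, not part of the commit.

import os
from typing import Optional

import torch
import triton
from triton import knobs


def enable_consan(mode: str) -> None:
    # Same switching logic as run_failing_kernel above (helper name is made up).
    if mode == "env":
        os.environ["TRITON_INSTRUMENTATION_MODE"] = "consan"
        knobs.refresh_knobs()  # re-read environment-backed knobs
    elif mode == "knob":
        knobs.compilation.instrumentation_mode = "consan"


# ConSan needs a global scratch allocation, so register a simple allocator.
def alloc_fn(size: int, alignment: int, stream: Optional[int]):
    return torch.empty(size, device="cuda", dtype=torch.int8)


triton.set_allocator(alloc_fn)
enable_consan("knob")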

@@ -134,8 +137,9 @@ def test_async_tma_kernel(FAILURE, device, run_wrapper, monkeypatch):
         assert "Buffer being accessed has outstanding writes" in result.driver_stderr_output
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -184,8 +188,9 @@ def test_tma_interleave_kernel(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -243,8 +248,9 @@ def test_async_copy(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -296,8 +302,9 @@ def test_tcgen5_mma(FAILURE, MEM_ACCESS_KIND, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -359,8 +366,9 @@ def test_warpgroup_mma(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -403,8 +411,9 @@ def test_warpgroup_mma2(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -452,8 +461,9 @@ def test_tcgen5_mma_multibar(BUF_IDX, BAR_IDX, device, run_wrapper, monkeypatch)
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -511,8 +521,9 @@ def test_multibuffered_loop(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -625,8 +636,9 @@ def test_multibuffered_wgmma_loop(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -705,8 +717,9 @@ def test_ws_store_wait_load(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -758,8 +771,9 @@ def test_ws_load_wait_store(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -811,8 +825,9 @@ def test_ws_two_loads_two_bars(MISSING_BAR, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -873,8 +888,9 @@ def test_ws_two_loads_one_bar(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -935,8 +951,9 @@ def test_ws_two_loads_two_bars_loop(MISSING_BAR, device, run_wrapper, monkeypatc
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1015,8 +1032,9 @@ def test_ws_load_ordering(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1079,8 +1097,9 @@ def test_ws_two_producers_two_consumers(MISSING_BAR, device, run_wrapper, monkey
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1166,8 +1185,9 @@ def test_ws_different_warp_sizes(MISSING_BAR, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1236,8 +1256,9 @@ def test_ws_async_copy_commits(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1300,8 +1321,9 @@ def test_ws_async_copy_wait_visibility(FAILURE, device, run_wrapper, monkeypatch
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1352,8 +1374,9 @@ def test_ws_wgmma_wait_visibility(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1400,8 +1423,9 @@ def test_deadlock_two_partitions(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1434,8 +1458,9 @@ def test_deadlock_overarrival(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1463,8 +1488,9 @@ def test_deadlock_underarrival(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1499,8 +1525,9 @@ def test_deadlock_different_phases(device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1534,8 +1561,9 @@ def test_deadlock_exempt_when_tma_signals(device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1577,8 +1605,9 @@ def test_barrier_underflow(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Barrier arrive underflow: current count would become negative" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

python/test/gluon/test_core.py

Lines changed: 0 additions & 2 deletions
@@ -696,8 +696,6 @@ def kernel(in_ptr, out_ptr, smem_h: ttgl.constexpr, smem_w: ttgl.constexpr, num_
         tmem_alias: ttgl.constexpr = TensorMemoryLayout((num_rows, num_cols), col_stride=1)
         tmem = tmem._reinterpret(ttgl.int8, (num_rows, num_cols), tmem_alias)
         value = tmem.load(blocked)
-        ttgl.static_print(ttgl.to_linear_layout(blocked, (smem_h, smem_w)))
-        ttgl.static_print(ttgl.to_linear_layout(blocked, (num_rows, num_cols)))
         ttgl.store(ttgl.set_auto_layout(out_ptrs, blocked), value)
 
     torch.manual_seed(0)
