
Commit 1d74879

[ConSan] Make sure the kernel is recompiled when ConSan state changes (#8342)

Include the ConSan state in the kernel's compilation options and kwargs to force a JIT cache miss when it changes. Also prepare the knobs to unify this behavior with Proton by introducing an `instrumentation_mode` compilation knob that the Proton runtime can set to "consan" or "proton".
Parent: 6e4647e
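For context, a minimal sketch of how the new knob is driven from user or test code, based on the test changes below (a CUDA build of Triton is assumed; both paths end up in `knobs.compilation.instrumentation_mode`, which this commit threads into the compilation cache key):

import os
from triton import knobs

# Option 1: set the knob directly; the JIT picks it up on the next launch.
knobs.compilation.instrumentation_mode = "consan"

# Option 2: set the environment variable. The knob caches its value at
# import time, so an explicit refresh is needed for the change to be seen.
os.environ["TRITON_INSTRUMENTATION_MODE"] = "consan"
knobs.refresh_knobs()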

4 files changed: 75 additions, 38 deletions

python/test/gluon/test_consan.py (65 additions, 36 deletions)

@@ -86,40 +86,43 @@ def failing_kernel(input):
     ampere.async_copy.wait_group(0)


-def run_failing_kernel(device):
+def run_failing_kernel(device, enable_consan, mode):
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

     triton.set_allocator(alloc_fn)

+    if enable_consan:
+        if mode == "env":
+            os.environ["TRITON_INSTRUMENTATION_MODE"] = "consan"
+            knobs.refresh_knobs()
+        elif mode == "knob":
+            knobs.compilation.instrumentation_mode = "consan"
+
     input = torch.randn((XBLOCK, XBLOCK), device=device, dtype=torch.float16)
     failing_kernel[(1, )](input)


 @pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper")
-def test_cache_miss_knob(device, fresh_knobs, monkeypatch):
+def test_cache_miss_knob(device, monkeypatch):
     # First run without consan
-    knobs.compilation.enable_experimental_consan = False
-    run_failing_kernel(device)
+    run_in_process(run_failing_kernel, (device, False, "knob"))

     # Then run with consan and assert that it fails
-    knobs.compilation.enable_experimental_consan = True
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
-    result = run_in_process(run_failing_kernel, (device, ))
+    result = run_in_process(run_failing_kernel, (device, True, "knob"))
     assert "device-side assert" in str(result.exc)


 @pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper")
 def test_cache_miss_env(device, monkeypatch):
     # First run without consan
-    knobs.compilation.enable_experimental_consan = False
-    run_failing_kernel(device)
+    run_in_process(run_failing_kernel, (device, False, "env"))

     # Then run with consan and assert that it fails
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
-    result = run_in_process(run_failing_kernel, (device, ))
+    result = run_in_process(run_failing_kernel, (device, True, "env"))
     assert "device-side assert" in str(result.exc)

@@ -133,8 +136,9 @@ def test_async_tma_kernel(FAILURE, device, run_wrapper, monkeypatch):
         assert "Buffer being accessed has outstanding writes" in result.driver_stderr_output
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -182,8 +186,9 @@ def test_tma_interleave_kernel(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -240,8 +245,9 @@ def test_async_copy(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -292,8 +298,9 @@ def test_tcgen5_mma(FAILURE, MEM_ACCESS_KIND, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -355,8 +362,9 @@ def test_warpgroup_mma(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -399,8 +407,9 @@ def test_warpgroup_mma2(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -447,8 +456,9 @@ def test_tcgen5_mma_multibar(BUF_IDX, BAR_IDX, device, run_wrapper, monkeypatch)
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -505,8 +515,9 @@ def test_multibuffered_loop(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -619,8 +630,9 @@ def test_multibuffered_wgmma_loop(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -699,8 +711,9 @@ def test_ws_store_wait_load(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -752,8 +765,9 @@ def test_ws_load_wait_store(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -805,8 +819,9 @@ def test_ws_two_loads_two_bars(MISSING_BAR, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -867,8 +882,9 @@ def test_ws_two_loads_one_bar(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -929,8 +945,9 @@ def test_ws_two_loads_two_bars_loop(MISSING_BAR, device, run_wrapper, monkeypatc
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1009,8 +1026,9 @@ def test_ws_load_ordering(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1073,8 +1091,9 @@ def test_ws_two_producers_two_consumers(MISSING_BAR, device, run_wrapper, monkey
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1160,8 +1179,9 @@ def test_ws_different_warp_sizes(MISSING_BAR, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1229,8 +1249,9 @@ def test_ws_async_copy_commits(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1292,8 +1313,9 @@ def test_ws_async_copy_wait_visibility(FAILURE, device, run_wrapper, monkeypatch
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1344,8 +1366,9 @@ def test_ws_wgmma_wait_visibility(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return

-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1392,8 +1415,9 @@ def test_deadlock_two_partitions(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1426,8 +1450,9 @@ def test_deadlock_overarrival(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1455,8 +1480,9 @@ def test_deadlock_underarrival(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1491,8 +1517,9 @@ def test_deadlock_different_phases(device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1526,8 +1553,9 @@ def test_deadlock_exempt_when_tma_signals(device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1569,8 +1597,9 @@ def test_barrier_underflow(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Barrier arrive underflow: current count would become negative" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()

     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
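The cache-miss tests above now launch each kernel in a fresh subprocess so that knob and environment state cannot leak between the clean run and the instrumented run. `run_in_process` is defined elsewhere in the test suite; the sketch below is a hypothetical reconstruction consistent with how it is called here (a callable plus an args tuple, returning an object that exposes `.exc` and `.driver_stderr_output`):

import multiprocessing
from dataclasses import dataclass
from typing import Optional


@dataclass
class ProcResult:
    exc: Optional[BaseException]  # exception raised in the child, if any
    driver_stderr_output: str     # stderr captured from the child process


def _child(queue, fn, args):
    # Runs inside the subprocess: execute the test body, report any error.
    try:
        fn(*args)
        queue.put(None)
    except BaseException as e:
        queue.put(e)


def run_in_process(fn, args) -> ProcResult:
    # "spawn" gives the child a fresh interpreter and CUDA context, so env
    # vars and knob values set inside it cannot leak back to the parent.
    ctx = multiprocessing.get_context("spawn")
    queue = ctx.Queue()
    proc = ctx.Process(target=_child, args=(queue, fn, args))
    proc.start()
    proc.join()
    exc = queue.get() if not queue.empty() else None
    # The real helper also captures the CUDA driver's stderr; omitted here.
    return ProcResult(exc=exc, driver_stderr_output="")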

python/triton/knobs.py (4 additions, 1 deletion)

@@ -364,7 +364,9 @@ class compilation_knobs(base_knobs):
     disable_line_info: env_bool = env_bool("TRITON_DISABLE_LINE_INFO")
     front_end_debugging: env_bool = env_bool("TRITON_FRONT_END_DEBUGGING")
     allow_non_constexpr_globals: env_bool = env_bool("TRITON_ALLOW_NON_CONSTEXPR_GLOBALS")
-    enable_experimental_consan: env_bool = env_bool("TRITON_ENABLE_EXPERIMENTAL_CONSAN")
+    # Instrumentation mode is checked on every run, which is expensive.
+    # We cache the value here to avoid the expensive check on every run.
+    instrumentation_mode: str = env_str("TRITON_INSTRUMENTATION_MODE", "").get()
     listener: Union[CompilationListener, None] = None


@@ -533,3 +535,4 @@ class proton_knobs(base_knobs):

 def refresh_knobs():
     runtime.debug = env_bool("TRITON_DEBUG").get()
+    compilation.instrumentation_mode = env_str("TRITON_INSTRUMENTATION_MODE", "").get()
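Note the trade-off this introduces: unlike the lazy `env_bool` knobs above it, `instrumentation_mode` is snapshotted with `.get()` when the module is loaded, so changing the environment variable afterwards has no effect until `refresh_knobs()` re-reads it. That is exactly why the tests above call `knobs.refresh_knobs()` after `monkeypatch.setenv(...)`. A small illustration (assumes the variable was unset when Triton was imported):

import os
from triton import knobs

os.environ["TRITON_INSTRUMENTATION_MODE"] = "consan"
print(knobs.compilation.instrumentation_mode)  # still "" - stale snapshot

knobs.refresh_knobs()  # re-reads TRITON_INSTRUMENTATION_MODE
print(knobs.compilation.instrumentation_mode)  # now "consan"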

python/triton/runtime/jit.py (1 addition, 0 deletions)

@@ -650,6 +650,7 @@ def _pack_args(self, backend, kwargs, bound_args, specialization, options):

     def run(self, *args, grid, warmup, **kwargs):
         kwargs["debug"] = kwargs.get("debug", self.debug) or knobs.runtime.debug
+        kwargs["instrumentation_mode"] = knobs.compilation.instrumentation_mode

         # parse options
         device = driver.active.get_current_device()
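This one-line change is what actually forces the recompile: the mode is injected into the kwargs that feed option parsing, so it becomes part of the key under which compiled kernels are cached, and flipping the mode can no longer hit a stale binary. A toy model of the mechanism (not Triton's actual cache code):

# Toy options-keyed compile cache showing why an extra kwarg forces a miss.
_cache = {}


def compile_kernel(src, **opts):
    key = (src, tuple(sorted(opts.items())))  # options are part of the key
    if key not in _cache:
        _cache[key] = object()  # stand-in for a freshly compiled binary
    return _cache[key]


plain = compile_kernel("kernel", instrumentation_mode="")
consan = compile_kernel("kernel", instrumentation_mode="consan")
assert plain is not consan  # different mode -> different key -> recompile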

third_party/nvidia/backend/compiler.py (5 additions, 1 deletion)

@@ -170,6 +170,10 @@ def __init__(self, target: GPUTarget) -> None:
         self.binary_ext = "cubin"

     def parse_options(self, opts) -> Any:
+        # Enable debug mode for ConSan, so device-side assertions are not optimized out
+        if "instrumentation_mode" in opts and opts["instrumentation_mode"] == "consan":
+            opts["debug"] = True
+
         args = {'arch': knobs.runtime.override_arch or f"sm{self.target.arch}"}
         args.update({k: opts[k] for k in CUDAOptions.__dataclass_fields__.keys() if k in opts if opts[k] is not None})
         capability = int(self._parse_arch(args["arch"]))

@@ -353,7 +357,7 @@ def make_llir(self, src, metadata, options, capability):
         passes.gluon.add_inliner(pm)
         nvidia.passes.ttgpuir.add_allocate_shared_memory_nv(pm, capability, ptx_version)
         nvidia.passes.ttnvgpuir.add_allocate_tensor_memory(pm)
-        if knobs.compilation.enable_experimental_consan:
+        if knobs.compilation.instrumentation_mode == "consan":
             # Call ConcurrencySanitizerPass here, before allocating global scratch memory but after allocating tensor and shared
             passes.ttgpuir.add_concurrency_sanitizer(pm)
         passes.ttgpuir.add_allocate_global_scratch_memory(pm)
