
Commit 9311c40

[TEST] Do not pass tl.constexpr as an argument to a kernel and fix leaking hook (#7682)
It is breaking CI
Parent: eb66546

File tree

2 files changed: +35 / -31 lines


python/triton_kernels/tests/test_specialize.py

Lines changed: 33 additions & 29 deletions
@@ -53,32 +53,36 @@ def cache_hook(*args, **kwargs):
         fn_name = kwargs["fn"].name
         module_name = kwargs["fn"].module

-    triton.knobs.runtime.jit_cache_hook = cache_hook
-    o = torch.empty((1, ), dtype=torch.float32, device=device)
-    k = specialized_kernel[(1, )](o, )
-    hash = k.hash
-    assert o.item() == 1.0
-    assert module_name == "tests.test_specialize"
-    assert fn_name == "cacheable_kernel"
-
-    compile_count = 0
-
-    def count_hook(*args, **kwargs):
-        nonlocal compile_count
-        compile_count += 1
-
-    triton.knobs.runtime.jit_cache_hook = count_hook
-    # clear the cache
-    specialized_kernel.device_caches.clear()
-
-    # retrieve the kernel from name and preload it.
-    fn = retrieve_fn(module_name, fn_name)
-    assert fn == specialized_kernel
-    preload = fn.preload(specialization_data)
-    assert compile_count == 1
-    assert preload.hash == hash
-
-    # verify that we hit the cache.
-    compile_count = 0
-    specialized_kernel[(1, )](o, )
-    assert compile_count == 0
+    prev_hook = triton.knobs.runtime.jit_cache_hook
+    try:
+        triton.knobs.runtime.jit_cache_hook = cache_hook
+        o = torch.empty((1, ), dtype=torch.float32, device=device)
+        k = specialized_kernel[(1, )](o, )
+        hash = k.hash
+        assert o.item() == 1.0
+        assert module_name == "tests.test_specialize"
+        assert fn_name == "cacheable_kernel"
+
+        compile_count = 0
+
+        def count_hook(*args, **kwargs):
+            nonlocal compile_count
+            compile_count += 1
+
+        triton.knobs.runtime.jit_cache_hook = count_hook
+        # clear the cache
+        specialized_kernel.device_caches.clear()
+
+        # retrieve the kernel from name and preload it.
+        fn = retrieve_fn(module_name, fn_name)
+        assert fn == specialized_kernel
+        preload = fn.preload(specialization_data)
+        assert compile_count == 1
+        assert preload.hash == hash
+
+        # verify that we hit the cache.
+        compile_count = 0
+        specialized_kernel[(1, )](o, )
+        assert compile_count == 0
+    finally:
+        triton.knobs.runtime.jit_cache_hook = prev_hook
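
The test change above addresses the "leaking hook" half of the commit title: the previous version installed `cache_hook` and `count_hook` on `triton.knobs.runtime.jit_cache_hook` and never restored the original value, so the hooks stayed active for every test that ran afterwards. A minimal sketch of the same save/restore idea, packaged as a context manager; the helper name `temporary_jit_cache_hook` is illustrative and not part of the repository:

```python
import contextlib

import triton


@contextlib.contextmanager
def temporary_jit_cache_hook(hook):
    """Install `hook` as the JIT cache hook, restoring the previous one on exit."""
    # Save whatever hook (possibly None) is currently installed.
    prev_hook = triton.knobs.runtime.jit_cache_hook
    triton.knobs.runtime.jit_cache_hook = hook
    try:
        yield
    finally:
        # Runs even if the body raises or an assert fails,
        # so the hook cannot leak into later tests.
        triton.knobs.runtime.jit_cache_hook = prev_hook
```

Inside a test this would read `with temporary_jit_cache_hook(count_hook): specialized_kernel[(1, )](o, )`, which is equivalent to the explicit try/finally the commit adds.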

python/triton_kernels/triton_kernels/numerics_details/mxfp.py

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@ def downcast_to_mxfp(src_tensor: torch.Tensor, out_quant_type: torch.dtype, axis
     kernel_scale = out_scale.view(-1, out_scale.shape[-1])

     BLOCK_OUT_DIM = 128
-    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE
+    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value
     grid_out = triton.cdiv(kernel_src_tensor.shape[0], BLOCK_OUT_DIM)
     grid_quant = triton.cdiv(kernel_src_tensor.shape[1], BLOCK_QUANT_DIM)

@@ -93,7 +93,7 @@ def upcast_from_mxfp(tensor: torch.Tensor, scale: torch.Tensor, dtype: torch.dty
     reshaped_tensor = tensor.view(-1, tensor.shape[-1])
     reshaped_scale = scale.view(-1, scale.shape[-1])
     BLOCK_OUT_DIM = 128
-    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE
+    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value
     blocks_out_dim = triton.cdiv(reshaped_out.shape[0], BLOCK_OUT_DIM)
     blocks_quant_dim = triton.cdiv(reshaped_out.shape[1], BLOCK_QUANT_DIM)
     _upcast_from_mxfp[(blocks_out_dim, blocks_quant_dim)](reshaped_out, *reshaped_out.stride(), reshaped_scale,
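
The mxfp.py change addresses the other half of the commit title: judging by the `.value` access and the title, `MXFP_BLOCK_SIZE` is a `tl.constexpr`, and the host-side grid math plus the kernel launch want the plain Python integer it wraps rather than the wrapper object. A minimal sketch of the unwrap, using an assumed placeholder value of 32:

```python
import triton
import triton.language as tl

# Assumed stand-in for the library constant; the real value may differ.
MXFP_BLOCK_SIZE = tl.constexpr(32)

# Host-side code uses the unwrapped int, mirroring the change above.
BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value          # plain Python int
grid_quant = triton.cdiv(1024, BLOCK_QUANT_DIM)  # e.g. 1024 columns -> 32 blocks

assert isinstance(BLOCK_QUANT_DIM, int)
assert grid_quant == 32
```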
