[Interpreter][histogram] Fix silent data corruption (#8550)

mieshkiwrk · web-flow · commit 33f077b379b4 · 2025-10-29T11:37:33.000-04:00
There's silent data corruption when calling `tl.histogram` with the interpreter.

```python
# test.py
import torch
import ctypes
import triton
import triton.language as tl

@triton.jit
def histogram_kernel(x_ptr, z_ptr):
    offset = tl.arange(0, 1)
    x = tl.load(x_ptr + offset)
    z = tl.histogram(x, 1)
    buf = (ctypes.c_int32 * 2).from_address(int(z_ptr))
    print(f'before store: {list(buf)}')
    tl.store(z_ptr + offset, z)  # tl.store treats z values as int64 while they're int32
    print(f'after store: {list(buf)}')

device = 'cpu'
torch.manual_seed(17)
x = torch.ones(1, device=device, dtype=torch.int32)
z = torch.ones(2, dtype=torch.int32, device=device)
histogram_kernel[(1, )](x, z)

# Output:
# TRITON_INTERPRET=1 TRITON_TEST_SUITE=interpreter python test.py
# before store: [1, 1]
# after store: [1, 0] <- second element shouldn't be cleared
```

Based on the `np.histogram` docs: https://numpy.org/doc/2.3/reference/generated/numpy.histogram.html

The returned dtype follows the dtype of the optional `weights` parameter when it is passed, and is int64 otherwise. That leads `tl.store` to think it is saving int64 values while the tensor passed in my example is int32, so it writes 8 bytes per element instead of 4. The extra 4 bytes land past the end of the element's storage, causing silent data corruption.

```python
import numpy as np
data = np.array([1], dtype=np.int32)
bins = 1
print(f'Data dtype before: {data.dtype}')
histogram = np.histogram(data, bins=bins, range=(0, bins))[0]
print(f'Data dtype after: {histogram.dtype}')
# Data dtype before: int32
# Data dtype after: int64
```

Passing `dummy_weights` makes the returned dtype match the input dtype as expected, which fixes the data corruption.

------------------------------

# New contributor declaration
- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because this is `np.histogram`-specific behavior in interpreter mode.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -2748,6 +2748,23 @@ def histogram_kernel(x_ptr, z_ptr, M: tl.constexpr, N: tl.constexpr):
     assert (z_torch == z).all()
 
 
+@pytest.mark.interpreter
+def test_histogram_silent_data_corruption(device):
+    # Regression test: tl.store of a 1-bin tl.histogram result must not clobber z[1].
+    @triton.jit
+    def histogram_kernel(x_ptr, z_ptr):
+        offset = tl.arange(0, 1)
+        x = tl.load(x_ptr + offset)
+        z = tl.histogram(x, 1)
+        tl.store(z_ptr + offset, z)
+
+    x = torch.ones(1, device=device, dtype=torch.int32)
+    z = torch.ones(2, device=device, dtype=torch.int32)  # one slot more than the kernel stores
+
+    histogram_kernel[(1, )](x, z)
+    assert z[1] == 1, f"Second element shouldn't be affected, expected_buffer=[1, 1], actual_buffer={z}"
+
+
 # ------------------------
 # test histogram with mask
 # ------------------------
diff --git a/python/triton/runtime/interpreter.py b/python/triton/runtime/interpreter.py
@@ -603,9 +603,17 @@ def create_make_range(self, ret_ty, start, stop):
     def create_histogram(self, data, bins, mask):
         if mask is None:
             mask = TensorHandle(np.ones_like(data.data, dtype=bool), tl.int1)
+
+        # np.histogram returns int64 counts unless `weights` is passed; with weights the
+        # counts take weights.dtype. All-ones weights in the input dtype keep the counts
+        # unchanged but, e.g., int32 for int32 input, so tl.store writes 4 bytes per element
+        # instead of 8 and no longer silently corrupts the memory after the output.
+        # NOTE(review): result dtype follows data.data.dtype but is tagged tl.int32 — confirm.
+        dummy_weights = np.ones_like(data.data, dtype=data.data.dtype)
+
         # force all masked elements to zero
         data = np.where(mask.data, data.data, np.zeros_like(data.data))
-        histogram = np.histogram(data, bins=bins, range=(0, bins))[0]
+        histogram = np.histogram(data, bins=bins, range=(0, bins), weights=dummy_weights)[0]
         # remove overcounted elements
         histogram[0] -= np.logical_not(mask.data).sum()
         return TensorHandle(histogram, tl.int32)