Skip to content

Commit a0fe0e3

Browse files
agron911 and meta-codesync[bot]
authored and committed
[Cherry-pick][RESOLVED] [GLUON] Add histogram support (#7989) (#540)
Summary: ⚠️ **MERGE CONFLICTS DETECTED** ⚠️ This cherry-pick contains merge conflicts that require manual resolution. Original Commit: bfffc33 Original Author: Thomas Raoux Original Date: 2025-08-27 14:14:41 -0700 **Action Required:** 1. Check out this branch locally 2. Resolve the merge conflicts in the affected files 3. Commit the resolved changes 4. Update this PR Original commit message: ``` [GLUON] Add histogram support (#7989) Expose histogram op in gluon. This will be convenient to test more complex layouts. ``` This PR was automatically cherry-picked from the upstream triton-lang/triton repository. The conflicts have been committed with conflict markers for easier resolution. Pull Request resolved: #540 Reviewed By: dshi7 Differential Revision: D85968323 Pulled By: agron911 fbshipit-source-id: f4e30114ab3a83973d5dab42d19633cddb19d49f
1 parent e8b4bf9 commit a0fe0e3

File tree

5 files changed

+60
-0
lines changed

5 files changed

+60
-0
lines changed

python/src/gluon_ir.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,20 @@ void init_gluon_ir(py::module &&m) {
436436
auto dstTy = cast<RankedTensorType>(resultTy);
437437
return isConvertLayoutTrivial(dstTy, value);
438438
})
439+
// Bind histogram-op creation for gluon: the result is a 1-D i32 tensor
// with `numBins` elements carrying the caller-supplied layout; the mask
// operand is attached only when one was provided.
.def("create_histogram",
     [](GluonOpBuilder &self, Value operand, int numBins,
        std::optional<Value> mask, Attribute layout) -> Value {
       auto *ctx = self.getContext();
       auto resultTy = RankedTensorType::get(
           {static_cast<int64_t>(numBins)}, IntegerType::get(ctx, 32),
           layout);
       if (mask)
         return self.create<triton::HistogramOp>(resultTy, operand, *mask);
       return self.create<triton::HistogramOp>(resultTy, operand);
     })
439453
.def("create_async_copy_global_to_local",
440454
[](GluonOpBuilder &self, Value smem, Value pointer, Value mask,
441455
Value other, tt::CacheModifier cacheModifier,

python/test/gluon/test_frontend.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,18 @@ def test_convert_layout(target):
7575
""")
7676

7777

78+
@filecheck_test
@gluon.jit
def test_histogram_frontend():
    # Lower a masked gluon histogram and check the emitted tt.histogram op.
    # CHECK: #blocked = #ttg.blocked
    # CHECK-LABEL: test_histogram_frontend
    blocked: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [4], [0])
    values = ttgl.arange(0, 256, layout=blocked)
    keep = values < 128
    # CHECK: tt.histogram %{{.*}}, %{{.*}} : tensor<256xi32, #blocked> -> tensor<512xi32, #blocked>
    _ = ttgl.histogram(values, 512, mask=keep, layout=blocked)
88+
89+
7890
@filecheck_test
7991
@gluon.jit
8092
def test_convert_layout_assert_trivial():

python/triton/experimental/gluon/language/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
device_assert,
4747
expand_dims,
4848
full,
49+
histogram,
4950
inline_asm_elementwise,
5051
join,
5152
load,

python/triton/experimental/gluon/language/_core.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,27 @@ def full(shape, value, dtype, layout=None, _semantic=None):
396396
return _semantic.full(shape, value, dtype, layout)
397397

398398

399+
@builtin
def histogram(input, num_bins, mask=None, layout=None, _semantic=None, _generator=None):
    """
    Build a histogram of a 1D integer tensor.

    Args:
        input (tensor): 1D tensor of integer values.
        num_bins (int): Number of bins. Bins have width 1 and start at 0.
        mask (Optional[tensor]): Boolean mask to exclude elements when False.
        layout (DistributedLayout): Destination layout of the output histogram.

    Returns:
        tensor: 1D int32 tensor of length `num_bins` with the requested layout.
    """
    # Arguments may arrive as constexpr wrappers from the frontend; unwrap
    # them before dispatching to the semantic layer.
    num_bins = _unwrap_if_constexpr(num_bins)
    layout = _unwrap_if_constexpr(layout)
    mask = None if mask is None else _semantic.to_tensor(mask)
    return _semantic.histogram(input, num_bins, mask, layout)
418+
419+
399420
@builtin
400421
def allocate_shared_memory(element_ty, shape, layout, value=None, _semantic=None) -> shared_memory_descriptor:
401422
"""

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,18 @@ def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) ->
346346
self._wrap_handle_infer_layout(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape)
347347
for i in range(len(inputs)))
348348

349+
def histogram(self, input: TensorTy, num_bins: int, mask: TensorTy, layout) -> TensorTy:
    """Create a histogram op; returns a 1D int32 tensor of `num_bins` bins."""
    # The op is only defined for 1-D integer inputs and requires an explicit
    # destination layout for the result tensor.
    _check(len(input.shape) == 1, lambda: "histogram only supports 1D input")
    _check(input.dtype.is_int(), lambda: "histogram only supports integer input")
    _check(layout is not None, lambda: "histogram requires a destination layout")
    mask_handle = None
    if mask is not None:
        # Align the mask's shape with the input, then require boolean elements.
        mask, input = self.broadcast_impl_value(mask, input)
        _check(mask.type.scalar.is_bool(), lambda: "Mask must have boolean scalar type")
        mask_handle = mask.handle
    result = self.builder.create_histogram(input.handle, num_bins, mask_handle,
                                           layout._to_ir(self.builder))
    return self.wrap_tensor(result, ttgl.int32, [num_bins], layout)
360+
349361
def warp_specialize(self, default_args, default_partition, worker_args, worker_partitions,
350362
worker_num_warps: Sequence[int], worker_num_regs: Sequence[int], generator):
351363
num_partitions = len(worker_partitions)

0 commit comments

Comments
 (0)