Commit 2a10b48

Add support for masked histograms (#6695)
# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent 5e00f35 commit 2a10b48

10 files changed (+129 −31 lines)


include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 8 additions & 3 deletions

```diff
@@ -865,19 +865,24 @@ def TT_ElementwiseInlineAsmOp : TT_Op<"elementwise_inline_asm", [
 //
 // Histogram Op
 //
-def TT_HistogramOp : TT_Op<"histogram", [Pure]> {
+def TT_HistogramOp : TT_Op<"histogram", [Pure,
+    TypesMatchWith<"mask type matches src type",
+                   "src", "mask", "getI1SameShape($_self)",
+                   "($_op.getOperands().size() <= 1) || std::equal_to<>()">]> {
   let summary = "return a histogram of the inputs.";
   let description = [{
     Return the histogram of the input tensor. The number of bins is equal to
     the dimension of the output tensor. Each bins has a width of 1 and bins
     start at 0.
   }];
 
-  let arguments = (ins TT_IntTensor:$src);
+  let arguments = (ins TT_IntTensor:$src,
+                       Optional<TT_BoolLike>:$mask);
+
   let results = (outs TT_IntTensor:$result);
 
   let assemblyFormat = [{
-    $src attr-dict `:` type($src) `->` type($result)
+    $src (`,` $mask^)? attr-dict `:` type($src) `->` type($result)
   }];
 }
```

lib/Conversion/TritonGPUToLLVM/HistogramOpToLLVM.cpp

Lines changed: 19 additions & 4 deletions

```diff
@@ -15,8 +15,9 @@ using namespace mlir::triton::gpu;
 // only popcount those.
 static SmallVector<Value> computeWarpLevelHistogram(
     Location loc, RankedTensorType srcType, SmallVector<Value> &srcValues,
-    int numBins, int numThreadPerWarp, Value threadId,
-    ConversionPatternRewriter &rewriter, const TargetInfoBase &targetInfo) {
+    SmallVector<Value> &maskValues, int numBins, int numThreadPerWarp,
+    Value threadId, ConversionPatternRewriter &rewriter,
+    const TargetInfoBase &targetInfo) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   assert(numBins % numThreadPerWarp == 0 &&
          "numBins must be divisible by numThreadPerWarp");
@@ -53,6 +54,14 @@ static SmallVector<Value> computeWarpLevelHistogram(
       mask = b.and_(
           mask, b.xor_(ballotBits[i + numBits - numBitsLaneId], updateMask));
     }
+    // save a ballot bit to capture the input mask
+    Value inputMaskBit = fullMask;
+    if (maskValues.size() > 0) {
+      inputMaskBit = targetInfo.ballot(rewriter, loc, int_ty(numThreadPerWarp),
+                                       maskValues[i]);
+    }
+    // mask out the values for which input mask is invalid
+    mask = b.and_(mask, inputMaskBit);
     // at this point, 'mask' tells you which elements are in a bin owned by this
     // thread.
     for (int k = 0; k < warpLevelHistogram.size(); k++) {
@@ -159,6 +168,12 @@ struct HistogramOpConversion
     Value input = adaptor.getSrc();
     auto typeConverter = getTypeConverter();
     SmallVector<Value> srcValues = unpackLLElements(loc, input, rewriter);
+
+    Value llMask = adaptor.getMask();
+    SmallVector<Value> maskValues;
+    if (llMask)
+      maskValues = unpackLLElements(loc, llMask, rewriter);
+
     int numBins = op.getType().getDimSize(0);
     auto mod = op->getParentOfType<ModuleOp>();
     int numThreadsPerWarp =
@@ -174,8 +189,8 @@ struct HistogramOpConversion
     auto srcType = op.getSrc().getType();
     // First compute a warp local histogram based on values owned by each warps.
     SmallVector<Value> warpLevelHistogram = computeWarpLevelHistogram(
-        loc, srcType, srcValues, numBins, numThreadsPerWarp, threadId, rewriter,
-        targetInfo);
+        loc, srcType, srcValues, maskValues, numBins, numThreadsPerWarp,
+        threadId, rewriter, targetInfo);
 
     // Then use atomic to update the histogram in shared memory.
     // TODO: we could skip this for cases with num_warps=1 as long as we can
```
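The idea behind this hunk: in addition to the per-bit ballots of the bin index, the lowering now ballots the `tl.histogram` mask across the warp (`targetInfo.ballot` on `maskValues[i]`) and ANDs that word into the per-bin membership mask, so lanes holding a masked-off element never get counted. Below is a rough CPU-side Python model of just that step; the `ballot` and `warp_histogram` helpers, their shapes, and the per-bin comparison are illustrative simplifications, not the actual lowering (which popcounts ballots of the bin-index bits).

```python
def ballot(bits):
    """Pack one boolean per lane into an integer bitmask, mimicking a warp ballot."""
    word = 0
    for lane, bit in enumerate(bits):
        if bit:
            word |= 1 << lane
    return word


def warp_histogram(values, valid, num_bins):
    """One element per lane; `valid[lane]` plays the role of the tl.histogram mask."""
    input_mask_word = ballot(valid)  # analogous to the new ballot of maskValues[i]
    hist = [0] * num_bins
    for b in range(num_bins):
        in_bin = ballot(v == b for v in values)  # lanes whose element falls in bin b
        in_bin &= input_mask_word                # drop lanes whose element is masked off
        hist[b] += bin(in_bin).count("1")        # popcount
    return hist


# Lane 2 holds a 1 but is masked off, so bin 1 counts only one element.
print(warp_histogram([0, 1, 1, 3], [True, True, False, True], num_bins=4))  # [1, 1, 0, 1]
```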

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 18 additions & 5 deletions

```diff
@@ -119,12 +119,24 @@ struct CanonicalizeConvertFromHistogram
   mlir::LogicalResult
   matchAndRewrite(triton::HistogramOp op,
                   PatternRewriter &rewriter) const override {
-    auto convert = op.getSrc().getDefiningOp<ConvertLayoutOp>();
-    if (!convert)
+    auto src = op.getSrc();
+    auto convert = src.getDefiningOp<ConvertLayoutOp>();
+    if (!convert) {
       return failure();
+    }
+    src = convert.getSrc();
+
+    // If mask is present, convert the layout of mask to match new src layout
+    auto mask = op.getMask();
+    if (mask) {
+      auto sharedType = getI1SameShape(src.getType());
+      rewriter.setInsertionPoint(op);
+      mask = rewriter.create<ConvertLayoutOp>(op.getLoc(), sharedType, mask);
+    }
+
     rewriter.replaceOpWithNewOp<triton::HistogramOp>(
-        op, op->getResult(0).getType(), convert.getSrc());
-    return mlir::success();
+        op, op->getResult(0).getType(), src, mask);
+    return success();
   }
 };
 
@@ -263,7 +275,8 @@ struct CanonicalizeConvertFromConvert
     // For histogram ops the input and output layouts are independent, so we
     // can always fold convert into the histogram op.
     rewriter.replaceOpWithNewOp<HistogramOp>(op, op->getResult(0).getType(),
-                                             histogram.getSrc());
+                                             histogram.getSrc(),
+                                             histogram.getMask());
     return success();
   }
 
```

python/src/ir.cc

Lines changed: 15 additions & 6 deletions

```diff
@@ -1667,12 +1667,21 @@ void init_triton_ir(py::module &&m) {
             return self.create<ub::PoisonOp>(type);
           })
       .def("create_histogram",
-           [](TritonOpBuilder &self, Value operand, int numBins) -> Value {
-             return self.create<HistogramOp>(
-                 RankedTensorType::get(
-                     {static_cast<int64_t>(numBins)},
-                     IntegerType::get(operand.getContext(), 32)),
-                 operand);
+           [](TritonOpBuilder &self, Value operand, int numBins,
+              std::optional<Value> mask) -> Value {
+             if (!mask) {
+               return self.create<HistogramOp>(
+                   RankedTensorType::get(
+                       {static_cast<int64_t>(numBins)},
+                       IntegerType::get(operand.getContext(), 32)),
+                   operand);
+             } else {
+               return self.create<HistogramOp>(
+                   RankedTensorType::get(
+                       {static_cast<int64_t>(numBins)},
+                       IntegerType::get(operand.getContext(), 32)),
+                   operand, *mask);
+             }
           })
       .def("create_gather",
           [](TritonOpBuilder &self, Value src, Value indices, int axis)
```

python/test/unit/language/test_core.py

Lines changed: 30 additions & 0 deletions

```diff
@@ -2858,6 +2858,36 @@ def histogram_kernel(x_ptr, z_ptr, M: tl.constexpr, N: tl.constexpr):
     assert (z_torch == z).all()
 
 
+# ------------------------
+# test histogram with mask
+# ------------------------
+
+
+@pytest.mark.interpreter
+@pytest.mark.parametrize("M, N", [[2048, 2], [1024, 8], [1024, 128], [256, 512], [32, 512], [8, 512], [8, 2]])
+def test_histogram_mask(M, N, device):
+
+    @triton.jit
+    def histogram_kernel(x_ptr, z_ptr, M: tl.constexpr, N: tl.constexpr):
+        offset1 = tl.arange(0, 2 * M)
+        offset2 = tl.arange(0, N)
+        mask = offset1 < M
+        x = tl.load(x_ptr + offset1)
+        z = tl.histogram(x, N, mask)
+        tl.store(z_ptr + offset2, z)
+
+    torch.manual_seed(17)
+    x1 = torch.randint(0, N, (M, ), device=device, dtype=torch.int32)
+    x = torch.cat((x1, x1), 0)
+    z = torch.empty(N, dtype=torch.int32, device=device)
+    # torch.histc does not work when the input type is not float and the device is CPU
+    # https://github.com/pytorch/pytorch/issues/74236
+    # This is a workaround: convert the input to float
+    z_torch = torch.histc(x1.float(), bins=N, min=0, max=N - 1)
+    histogram_kernel[(1, )](x, z, M=M, N=N)
+    assert (z_torch == z).all()
+
+
 @pytest.mark.parametrize("M, N", [(1, 64), (2, 32), (4, 16), (8, 8), (16, 4), (32, 2), (64, 1)])
 def test_scan_1d(M, N, device):
 
```

python/triton/language/core.py

Lines changed: 7 additions & 2 deletions

```diff
@@ -2733,17 +2733,22 @@ def make_combine_region(scan_op):
 
 @_tensor_member_fn
 @builtin
-def histogram(input, num_bins, _builder=None, _generator=None):
+def histogram(input, num_bins, mask=None, _builder=None, _generator=None):
     """computes an histogram based on input tensor with num_bins bins, the bins have a width of 1 and start at 0.
 
     :param input: the input tensor
     :type input: Tensor
    :param num_bins: number of histogram bins
     :type num_bins: int
+    :param mask: if `mask[idx]` is false, exclude `input[idx]` from histogram
+    :type mask: Block of `triton.int1`, optional
 
     """
     num_bins = _unwrap_if_constexpr(num_bins)
-    return semantic.histogram(input, num_bins, _builder)
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        mask = semantic.to_tensor(mask, _builder)
+    return semantic.histogram(input, num_bins, mask, _builder)
 
 
 @_tensor_member_fn
```
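For reference, a minimal usage sketch of the new `mask` argument, mirroring the end-to-end test added above. The kernel name, shapes, predicate, and `device="cuda"` choice below are illustrative, not from this PR:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def masked_histogram_kernel(x_ptr, z_ptr, M: tl.constexpr, N: tl.constexpr):
    offsets = tl.arange(0, M)
    x = tl.load(x_ptr + offsets)
    # only elements at even indices contribute to a bin
    z = tl.histogram(x, N, offsets % 2 == 0)
    tl.store(z_ptr + tl.arange(0, N), z)


x = torch.randint(0, 8, (1024, ), device="cuda", dtype=torch.int32)
z = torch.empty(8, dtype=torch.int32, device="cuda")
masked_histogram_kernel[(1, )](x, z, M=1024, N=8)
print(z)  # bin counts over x[0], x[2], x[4], ...
```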

python/triton/language/semantic.py

Lines changed: 7 additions & 2 deletions

```diff
@@ -1805,10 +1805,15 @@ def gather(src: tl.tensor, index: tl.tensor, axis: int, builder: ir.builder) ->
 # ===----------------------------------------------------------------------===
 
 
-def histogram(input: tl.tensor, num_bins: int, builder: ir.builder) -> tl.tensor:
+def histogram(input: tl.tensor, num_bins: int, mask: Optional[tl.tensor], builder: ir.builder) -> tl.tensor:
     assert len(input.shape) == 1, "histogram only supports 1D input"
     assert input.dtype.is_int(), "histogram only supports integer input"
-    return tl.tensor(builder.create_histogram(input.handle, num_bins), tl.block_type(tl.int32, [num_bins]))
+    if mask is not None:
+        mask = broadcast_impl_shape(mask, input.shape, builder)
+        if not mask.type.scalar.is_bool():
+            raise ValueError("Mask must have boolean scalar type")
+        mask = mask.handle
+    return tl.tensor(builder.create_histogram(input.handle, num_bins, mask), tl.block_type(tl.int32, [num_bins]))
 
 
 def multiple_of(x: tl.tensor, values: List[int]) -> tl.tensor:
```

python/triton/runtime/interpreter.py

Lines changed: 9 additions & 2 deletions

```diff
@@ -598,8 +598,15 @@ def create_dot(self, a, b, d, input_precision, max_num_imprecise_acc):
     def create_make_range(self, ret_ty, start, stop):
         return TensorHandle(np.arange(start, stop, dtype=np.int32), tl.int32)
 
-    def create_histogram(self, data, bins):
-        return TensorHandle(np.histogram(data.data, bins=bins, range=(0, bins))[0], tl.int32)
+    def create_histogram(self, data, bins, mask):
+        if mask is None:
+            mask = TensorHandle(np.ones_like(data.data, dtype=bool), tl.int1)
+        # force all masked elements to zero
+        data = np.where(mask.data, data.data, np.zeros_like(data.data))
+        histogram = np.histogram(data, bins=bins, range=(0, bins))[0]
+        # remove overcounted elements
+        histogram[0] -= np.logical_not(mask.data).sum()
+        return TensorHandle(histogram, tl.int32)
 
     def create_gather(self, src, indices, axis):
         return TensorHandle(np.take_along_axis(src.data, indices.data, axis=axis), src.dtype.scalar)
```
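The interpreter's trick is worth spelling out: masked-off elements are forced to zero, which inflates bin 0, and that inflation is then subtracted again, so the result equals a histogram of only the unmasked elements. A standalone NumPy illustration with made-up data:

```python
import numpy as np

data = np.array([0, 1, 1, 3, 2, 0], dtype=np.int32)
mask = np.array([True, True, False, True, False, True])
bins = 4

zeroed = np.where(mask, data, 0)                       # masked elements land in bin 0
hist = np.histogram(zeroed, bins=bins, range=(0, bins))[0]
hist[0] -= np.logical_not(mask).sum()                  # undo the inflation of bin 0

# same result as histogramming only the unmasked elements
assert (hist == np.histogram(data[mask], bins=bins, range=(0, bins))[0]).all()
print(hist)  # [2 1 0 1]
```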

test/Triton/ops.mlir

Lines changed: 7 additions & 0 deletions

```diff
@@ -243,6 +243,13 @@ tt.func @histogram(%0: tensor<512xi32>) {
   tt.return
 }
 
+// CHECK-LABEL: masked_histogram
+tt.func @masked_histogram(%0: tensor<512xi32>, %1: tensor<512xi1>) {
+  // CHECK: tt.histogram %{{.+}}, %{{.+}} : tensor<512xi32> -> tensor<16xi32>
+  %2 = tt.histogram %0, %1 : tensor<512xi32> -> tensor<16xi32>
+  tt.return
+}
+
 // CHECK-LABEL: descriptor_load
 tt.func @descriptor_load(%0: !tt.tensordesc<tensor<128xf32>>) {
   // CHECK: tt.descriptor_load %{{.+}}[%{{.+}}] : !tt.tensordesc<tensor<128xf32>> -> tensor<128xf32>
```

test/TritonGPU/canonicalize.mlir

Lines changed: 9 additions & 7 deletions

```diff
@@ -81,20 +81,22 @@ tt.func @test_canonicalize_convert_view(%arg0: tensor<64x64xf32, #blocked0>) ->
 // -----
 
 // CHECK-LABEL: @test_canonicalize_convert_histogram
-// CHECK-SAME: (%[[ARG:.+]]: tensor<256xi32
-// CHECK-NOT: ttg.convert_layout
-// CHECK: %[[V:.+]] = tt.histogram %[[ARG]]
+// CHECK-SAME: (%[[SRC:.+]]: tensor<256xi32
+// CHECK-SAME: %[[MASK:.+]]: tensor<256xi1
+// CHECK: %[[M:.+]] = ttg.convert_layout %[[MASK]]
+// CHECK: %[[V:.+]] = tt.histogram %[[SRC]], %[[M]]
 // CHECK-NOT: ttg.convert_layout
 // CHECK: tt.return %[[V]]
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, "ttg.target" = "cuda:80"} {
-tt.func @test_canonicalize_convert_histogram(%arg0: tensor<256xi32, #blocked1>) -> tensor<512xi32, #blocked2> {
+tt.func @test_canonicalize_convert_histogram(%arg0: tensor<256xi32, #blocked1>, %arg1: tensor<256xi1, #blocked2>) -> tensor<512xi32, #blocked2> {
   %0 = ttg.convert_layout %arg0 : tensor<256xi32, #blocked1> -> tensor<256xi32, #blocked>
-  %1 = tt.histogram %0 : tensor<256xi32, #blocked> -> tensor<512xi32, #blocked>
-  %2 = ttg.convert_layout %1 : tensor<512xi32, #blocked> -> tensor<512xi32, #blocked2>
-  tt.return %2 : tensor<512xi32, #blocked2>
+  %1 = ttg.convert_layout %arg1 : tensor<256xi1, #blocked2> -> tensor<256xi1, #blocked>
+  %2 = tt.histogram %0, %1 : tensor<256xi32, #blocked> -> tensor<512xi32, #blocked>
+  %3 = ttg.convert_layout %2 : tensor<512xi32, #blocked> -> tensor<512xi32, #blocked2>
+  tt.return %3 : tensor<512xi32, #blocked2>
 }
 } // end module
 
```
