@@ -11,8 +11,9 @@ using namespace mlir::triton;
 // only popcount those.
 static SmallVector<Value> computeWarpLevelHistogram(
     Location loc, RankedTensorType srcType, SmallVector<Value> &srcValues,
-    int numBins, int numThreadPerWarp, Value threadId,
-    ConversionPatternRewriter &rewriter, const TargetInfoBase &targetInfo) {
+    SmallVector<Value> &maskValues, int numBins, int numThreadPerWarp,
+    Value threadId, ConversionPatternRewriter &rewriter,
+    const TargetInfoBase &targetInfo) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   assert(numBins % numThreadPerWarp == 0 &&
          "numBins must be divisible by numThreadPerWarp");
@@ -49,6 +50,14 @@ static SmallVector<Value> computeWarpLevelHistogram(
       mask = b.and_(
           mask, b.xor_(ballotBits[i + numBits - numBitsLaneId], updateMask));
     }
+    // save a ballot bit to capture the input mask
+    Value inputMaskBit = fullMask;
+    if (maskValues.size() > 0) {
+      inputMaskBit = targetInfo.ballot(rewriter, loc, int_ty(numThreadPerWarp),
+                                       maskValues[i]);
+    }
+    // mask out the values for which input mask is invalid
+    mask = b.and_(mask, inputMaskBit);
     // at this point, 'mask' tells you which elements are in a bin owned by this
     // thread.
     for (int k = 0; k < warpLevelHistogram.size(); k++) {
@@ -158,6 +167,12 @@ struct HistogramOpConversion
     Value input = adaptor.getSrc();
     auto typeConverter = getTypeConverter();
     SmallVector<Value> srcValues = unpackLLElements(loc, input, rewriter);
+
+    Value llMask = adaptor.getMask();
+    SmallVector<Value> maskValues;
+    if (llMask)
+      maskValues = unpackLLElements(loc, llMask, rewriter);
+
     int numBins = op.getType().getDimSize(0);
     auto mod = op->getParentOfType<ModuleOp>();
     int numThreadsPerWarp =
@@ -173,8 +188,8 @@ struct HistogramOpConversion
     auto srcType = op.getSrc().getType();
     // First compute a warp local histogram based on values owned by each warp.
     SmallVector<Value> warpLevelHistogram = computeWarpLevelHistogram(
-        loc, srcType, srcValues, numBins, numThreadsPerWarp, threadId, rewriter,
-        targetInfo);
+        loc, srcType, srcValues, maskValues, numBins, numThreadsPerWarp,
+        threadId, rewriter, targetInfo);
 
     // Then use atomic to update the histogram in shared memory.
     // TODO: we could skip this for cases with num_warps=1 as long as we can
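For readers skimming the diff, here is a minimal scalar sketch of the semantics the new `maskValues` path gives the lowering: the ballot of the mask operand is ANDed into the per-bin lane mask, so an element whose mask bit is false is never counted in any bin, and an absent mask behaves like an all-true mask. The helper name and signature below are illustrative only and are not part of the Triton sources.

```cpp
// Scalar model of a masked histogram, assuming the same semantics the
// warp-level lowering implements with ballot/popcount. An empty `mask`
// models the case where the op carries no mask operand (llMask == nullptr).
#include <cstdint>
#include <vector>

std::vector<uint32_t> maskedHistogramModel(const std::vector<int> &values,
                                           const std::vector<bool> &mask,
                                           int numBins) {
  std::vector<uint32_t> hist(numBins, 0);
  for (size_t i = 0; i < values.size(); ++i) {
    // Mirrors `mask = b.and_(mask, inputMaskBit)`: a masked-off element
    // contributes to no bin at all.
    if (!mask.empty() && !mask[i])
      continue;
    int bin = values[i];
    if (bin >= 0 && bin < numBins)
      ++hist[bin];
  }
  return hist;
}
```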