[HistogramOpToLLVM] Sync from 078954b (#5248)

HBN-MichalSzy · web-flow · commit ecf19976aacb · 2025-10-02T13:22:03.000-04:00
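This PR fixes #5148: the histogram lowering no longer masks out threads/warps that hold replicated data and instead divides the final histogram values by a layout-derived replication factor to avoid overcounting. The corresponding test_histogram entries are removed from the gluon skiplists, and the #5147 entry (test_2d_tensor_early_return) is removed from the lts skiplist.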
diff --git a/scripts/skiplist/a770/gluon.txt b/scripts/skiplist/a770/gluon.txt
@@ -1,6 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5148
-python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
-python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5149
 python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
 python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
diff --git a/scripts/skiplist/arl-h/gluon.txt b/scripts/skiplist/arl-h/gluon.txt
@@ -1,6 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5148
-python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
-python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5149
 python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
 python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
diff --git a/scripts/skiplist/arl-s/gluon.txt b/scripts/skiplist/arl-s/gluon.txt
@@ -1,6 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5148
-python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
-python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5149
 python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
 python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
diff --git a/scripts/skiplist/default/gluon.txt b/scripts/skiplist/default/gluon.txt
@@ -1,6 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5148
-python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
-python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5149
 python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
 python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
diff --git a/scripts/skiplist/lts/gluon.txt b/scripts/skiplist/lts/gluon.txt
@@ -1,8 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5147
-python/test/gluon/test_core.py::test_2d_tensor_early_return
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5148
-python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
-python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5149
 python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
 python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
diff --git a/scripts/skiplist/mtl/gluon.txt b/scripts/skiplist/mtl/gluon.txt
@@ -1,6 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5148
-python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
-python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5149
 python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
 python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
diff --git a/scripts/skiplist/xe2/gluon.txt b/scripts/skiplist/xe2/gluon.txt
@@ -1,6 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/5148
-python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
-python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/5149
 python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
 python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp
@@ -2,6 +2,7 @@
 
 using namespace mlir;
 using namespace mlir::triton;
+using namespace mlir::triton::gpu;
 
 // Compute a histogram within a warp. This uses an algorithm by @apgoucher
 // that does the following:
@@ -20,9 +21,7 @@ static SmallVector<Value> computeWarpLevelHistogram(
   Value zero = b.i32_val(0);
   int numBits = llvm::Log2_64(numBins);
   int numBitsLaneId = llvm::Log2_64(numThreadPerWarp);
-  unsigned numElementsPerThreads = triton::gpu::getTotalElemsPerThread(srcType);
-  unsigned numThreadWithUniqueData = triton::gpu::getThreadsPerWarp(
-      srcType.getEncoding(), srcType.getShape())[0];
+  unsigned numElementsPerThreads = getTotalElemsPerThread(srcType);
   // The histogram is distributed across threads, each thread owns `numBins /
   // numThreadPerWarp` bins.
   SmallVector<Value> warpLevelHistogram(numBins / numThreadPerWarp, zero);
@@ -39,10 +38,6 @@ static SmallVector<Value> computeWarpLevelHistogram(
     uint64_t fullMaskValue = (1ll << numThreadPerWarp) - 1u;
     Value fullMask = b.int_val(numThreadPerWarp, fullMaskValue);
     Value mask = fullMask;
-    // If not all threads have unique data, mask out the redundant ones.
-    if (numThreadWithUniqueData < numThreadPerWarp) {
-      mask = b.int_val(numThreadPerWarp, (1ULL << numThreadWithUniqueData) - 1);
-    }
     for (int i = 0; i < numBitsLaneId; i++) {
       Value updateMask =
           b.select(b.icmp_ne(b.and_(threadId, b.i32_val(1 << i)), zero),
@@ -94,8 +89,6 @@ static SmallVector<Value> computeCrossWarpHistogram(
     Value threadId, int numWarps) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   SmallVector<Value> histogramValues;
-  unsigned numWarpsWithUniqueData = mlir::triton::gpu::getWarpsPerCTA(
-      srcType.getEncoding(), srcType.getShape())[0];
   Value laneId = b.and_(threadId, b.i32_val(numThreadPerWarp - 1));
   // Initialize the shared memory with zeros.
   int64_t numElementPerThread =
@@ -110,19 +103,6 @@ static SmallVector<Value> computeCrossWarpHistogram(
   }
   b.barrier();
   Block *afterAtomics = nullptr;
-  // If some warps have replicated data we need to skip those warps when
-  // accumulating.
-  if (numWarpsWithUniqueData < numWarps) {
-    Block *currentBlock = rewriter.getInsertionBlock();
-    afterAtomics =
-        rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint());
-    Block *atomicBlock = rewriter.createBlock(afterAtomics);
-    rewriter.setInsertionPointToEnd(currentBlock);
-    Value cond = b.icmp_ult(
-        threadId, b.i32_val(numWarpsWithUniqueData * numThreadPerWarp));
-    rewriter.create<LLVM::CondBrOp>(loc, cond, atomicBlock, afterAtomics);
-    rewriter.setInsertionPointToStart(atomicBlock);
-  }
   // Apply atomic add to update the histogram in shared memory.
   for (int i = 0; i < warpLevelHistogram.size(); ++i) {
     Value warpLevelHistogramValue = warpLevelHistogram[i];
@@ -208,6 +188,24 @@ struct HistogramOpConversion
         loc, rewriter, srcType, baseSharedMemPtr, warpLevelHistogram, numBins,
         numThreadsPerWarp, innerDimIndices, threadId, numWarps);
 
+    // Depending on the layout, some threads may have duplicate data. We can
+    // account for this by calculating a "replication factor" and dividing the
+    // results by it to avoid overcounting.
+    auto replicationFactor = numWarps * numThreadsPerWarp;
+    auto threadsPerWarp = getThreadsPerWarp(srcType);
+    auto warpsPerCTA =
+        getWarpsPerCTA(srcType.getEncoding(), srcType.getShape());
+    replicationFactor /= std::accumulate(
+        threadsPerWarp.begin(), threadsPerWarp.end(), 1, std::multiplies<>());
+    replicationFactor /= std::accumulate(warpsPerCTA.begin(), warpsPerCTA.end(),
+                                         1, std::multiplies<>());
+
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+    for (auto i = 0; i < histogramValue.size(); ++i) {
+      histogramValue[i] =
+          b.sdiv(histogramValue[i], b.i32_val(replicationFactor));
+    }
+
     Value results = packLLElements(loc, typeConverter, histogramValue, rewriter,
                                    op.getType());
     rewriter.replaceOp(op, results);