Skip to content

Commit 234cd11

Browse files
saagarjha authored and
meta-codesync[bot] committed
[Cherry-pick][RESOLVED] Fix histograms for complex replicated layouts (#7938) (#546)
Summary: ⚠️ **MERGE CONFLICTS DETECTED** ⚠️ This cherry-pick contains merge conflicts that require manual resolution. Original Commit: 078954b Original Author: Saagar Jha Original Date: 2025-08-29 05:05:37 -0700 **Action Required:** 1. Check out this branch locally 2. Resolve the merge conflicts in the affected files 3. Commit the resolved changes 4. Update this PR Original commit message: ``` Fix histograms for complex replicated layouts (#7938) The current histogram code assumes that replication across a warp is done in a way that involves the first n threads having unique data. This is not a valid assumption; in fact the function it calls to get this layout, getThreadsPerWarp, describes one such layout and how it's returned, so the histogram code actually discards that information. To fix this, we actually remove the uniquing code that masks out threads possessing duplicate data. Instead we have everyone participate and adjust for the overcounting that results by computing the "replication factor". This is much easier than computing the correct mask, which is nontrivial in the general case. ``` This PR was automatically cherry-picked from the upstream triton-lang/triton repository. The conflicts have been committed with conflict markers for easier resolution. Pull Request resolved: #546 Reviewed By: agron911 Differential Revision: D85907975 Pulled By: dshi7 fbshipit-source-id: 218021919c1205249fe7a6783a0a186e91a56411
1 parent 36cde5d commit 234cd11

File tree

3 files changed

+56
-24
lines changed

3 files changed

+56
-24
lines changed

lib/Conversion/TritonGPUToLLVM/HistogramOpToLLVM.cpp

Lines changed: 18 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,6 @@ static SmallVector<Value> computeWarpLevelHistogram(
2525
int numBits = llvm::Log2_64(numBins);
2626
int numBitsLaneId = llvm::Log2_64(numThreadPerWarp);
2727
unsigned numElementsPerThreads = getTotalElemsPerThread(srcType);
28-
unsigned numThreadWithUniqueData = getThreadsPerWarp(srcType)[0];
2928
// The histogram is distributed across threads, each thread owns `numBins /
3029
// numThreadPerWarp` bins.
3130
SmallVector<Value> warpLevelHistogram(numBins / numThreadPerWarp, zero);
@@ -43,10 +42,6 @@ static SmallVector<Value> computeWarpLevelHistogram(
4342
numThreadPerWarp == 32 ? 0xFFFFFFFF : 0xFFFFFFFFFFFFFFFF;
4443
Value fullMask = b.int_val(numThreadPerWarp, fullMaskValue);
4544
Value mask = fullMask;
46-
// If not all threads have unique data, mask out the redundant ones.
47-
if (numThreadWithUniqueData < numThreadPerWarp) {
48-
mask = b.int_val(numThreadPerWarp, (1ULL << numThreadWithUniqueData) - 1);
49-
}
5045
for (int i = 0; i < numBitsLaneId; i++) {
5146
Value updateMask =
5247
b.select(b.icmp_ne(b.and_(threadId, b.i32_val(1 << i)), zero),
@@ -96,8 +91,6 @@ static SmallVector<Value> computeCrossWarpHistogram(
9691
Value threadId, int numWarps) {
9792
auto b = TritonLLVMOpBuilder(loc, rewriter);
9893
SmallVector<Value> histogramValues;
99-
unsigned numWarpsWithUniqueData = mlir::triton::gpu::getWarpsPerCTA(
100-
srcType.getEncoding(), srcType.getShape())[0];
10194
Value laneId = b.and_(threadId, b.i32_val(numThreadPerWarp - 1));
10295
// Initialize the shared memory with zeros.
10396
int64_t numElementPerThread =
@@ -112,19 +105,6 @@ static SmallVector<Value> computeCrossWarpHistogram(
112105
}
113106
b.barrier();
114107
Block *afterAtomics = nullptr;
115-
// If some warps have replicated data we need to skip those warps when
116-
// accumulating.
117-
if (numWarpsWithUniqueData < numWarps) {
118-
Block *currentBlock = rewriter.getInsertionBlock();
119-
afterAtomics =
120-
rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint());
121-
Block *atomicBlock = rewriter.createBlock(afterAtomics);
122-
rewriter.setInsertionPointToEnd(currentBlock);
123-
Value cond = b.icmp_ult(
124-
threadId, b.i32_val(numWarpsWithUniqueData * numThreadPerWarp));
125-
rewriter.create<LLVM::CondBrOp>(loc, cond, atomicBlock, afterAtomics);
126-
rewriter.setInsertionPointToStart(atomicBlock);
127-
}
128108
// Apply atomic add to update the histogram in shared memory.
129109
for (int i = 0; i < warpLevelHistogram.size(); ++i) {
130110
Value warpLevelHistogramValue = warpLevelHistogram[i];
@@ -209,6 +189,24 @@ struct HistogramOpConversion
209189
loc, rewriter, srcType, baseSharedMemPtr, warpLevelHistogram, numBins,
210190
numThreadsPerWarp, innerDimIndices, threadId, numWarps);
211191

192+
// Depending on the layout, some threads may have duplicate data. We can
193+
// account for this by calculating a "replication factor" and dividing the
194+
// results by it to avoid overcounting.
195+
auto replicationFactor = numWarps * numThreadsPerWarp;
196+
auto threadsPerWarp = getThreadsPerWarp(srcType);
197+
auto warpsPerCTA =
198+
getWarpsPerCTA(srcType.getEncoding(), srcType.getShape());
199+
replicationFactor /= std::accumulate(
200+
threadsPerWarp.begin(), threadsPerWarp.end(), 1, std::multiplies<>());
201+
replicationFactor /= std::accumulate(warpsPerCTA.begin(), warpsPerCTA.end(),
202+
1, std::multiplies<>());
203+
204+
auto b = TritonLLVMOpBuilder(loc, rewriter);
205+
for (auto i = 0; i < histogramValue.size(); ++i) {
206+
histogramValue[i] =
207+
b.sdiv(histogramValue[i], b.i32_val(replicationFactor));
208+
}
209+
212210
Value results = packLLElements(loc, typeConverter, histogramValue, rewriter,
213211
op.getType());
214212
rewriter.replaceOp(op, results);

python/test/gluon/test_lowerings.py

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -240,6 +240,40 @@ def kernel(x_ptr, y_ptr, M: ttgl.constexpr, layout: ttgl.constexpr):
240240
])
241241

242242

243+
@pytest.mark.parametrize("M, bins", [[2048, 2], [8, 512], [32, 32]])
244+
@pytest.mark.parametrize("src_layout", [ttgl.BlockedLayout([1], [THREADS_PER_WARP], [4], [0]), "linear_layout"])
245+
@pytest.mark.parametrize("dst_layout", [ttgl.BlockedLayout([1], [THREADS_PER_WARP], [4], [0])])
246+
def test_histogram(M, bins, src_layout, dst_layout, device):
247+
248+
@gluon.jit
249+
def kernel(x_ptr, z_ptr, M: ttgl.constexpr, B: ttgl.constexpr, src_layout: ttgl.constexpr,
250+
dst_layout: ttgl.constexpr):
251+
offs = ttgl.arange(0, M, layout=src_layout)
252+
x = ttgl.load(x_ptr + offs)
253+
h = ttgl.histogram(x, B, layout=dst_layout)
254+
z_offs = ttgl.arange(0, B, layout=dst_layout)
255+
ttgl.store(z_ptr + z_offs, h)
256+
257+
if src_layout == "linear_layout":
258+
if M == 32:
259+
src_layout = ttgl.DistributedLinearLayout(
260+
reg_bases=[],
261+
lane_bases=[[0], [16], [4], [2], [1]] + [[0]] * (THREADS_PER_WARP >> 6),
262+
warp_bases=[[0], [8]],
263+
block_bases=[],
264+
shape=(M, ),
265+
)
266+
else:
267+
pytest.skip("Linear layout is specialized for 32 elements")
268+
269+
torch.manual_seed(0)
270+
x = torch.randint(0, bins, (M, ), dtype=torch.int32, device=device)
271+
z = torch.zeros((bins, ), dtype=torch.int32, device=device)
272+
z_torch = torch.histc(x.float(), bins=bins, min=0, max=bins - 1).to(torch.int32)
273+
kernel[(1, )](x, z, M, bins, src_layout, dst_layout, num_warps=4)
274+
torch.testing.assert_close(z, z_torch, atol=0, rtol=0)
275+
276+
243277
@pytest.mark.parametrize("M", [64, 128, 256])
244278
@pytest.mark.parametrize("src_layout", _1d_layouts)
245279
@pytest.mark.parametrize("dst_layout", _1d_layouts)

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -187,10 +187,10 @@ def mangle(self):
187187

188188
def __hash__(self):
189189
return hash((
190-
tuple(self.reg_bases),
191-
tuple(self.lane_bases),
192-
tuple(self.warp_bases),
193-
tuple(self.block_bases),
190+
tuple(map(tuple, self.reg_bases)),
191+
tuple(map(tuple, self.lane_bases)),
192+
tuple(map(tuple, self.warp_bases)),
193+
tuple(map(tuple, self.block_bases)),
194194
tuple(self.shape),
195195
))
196196

0 commit comments

Comments (0)