Skip to content

Commit b882393

Browse files
[https://nvbugs/5720357][fix] Fix index offset overflow in custom Top-K kernel and corresponding UT case (#10027)
Signed-off-by: longcheng-nv <243710427+longcheng-nv@users.noreply.github.com> Co-authored-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com>
1 parent dfa11d8 commit b882393

File tree

2 files changed

+48
-15
lines changed

2 files changed

+48
-15
lines changed

cpp/tensorrt_llm/kernels/indexerTopK.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -606,8 +606,8 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
606606
int rowEnd = rowEnds[rowIdx];
607607

608608
// Local pointers to this block
609-
outIndices += rowIdx * topK;
610-
logits += rowIdx * stride0;
609+
outIndices += static_cast<int64_t>(rowIdx) * topK;
610+
logits += static_cast<int64_t>(rowIdx) * stride0;
611611

612612
topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
613613
nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
@@ -638,23 +638,23 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(f
638638
// Local pointers to this block
639639
if constexpr (!multipleBlocksPerRow && !mergeBlocks)
640640
{
641-
outIndices += rowIdx * topK;
641+
outIndices += static_cast<int64_t>(rowIdx) * topK;
642642
}
643643
else if constexpr (multipleBlocksPerRow)
644644
{
645645
auto const blockSize = rowEnd / gridDim.y; // 16384 / 2 = 8192
646646
rowStart = blockSize * blockIdx.y; // 8192 * 1 = 8192
647647
rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
648-
outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
649-
outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
648+
outIndices += static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
649+
outLogits += static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
650650
}
651651
else if constexpr (mergeBlocks)
652652
{
653653
rowEnd = numBlocksToMerge * topK;
654-
indices += rowIdx * numBlocksToMerge * topK;
655-
outIndices += rowIdx * topK;
654+
indices += static_cast<int64_t>(rowIdx) * numBlocksToMerge * topK;
655+
outIndices += static_cast<int64_t>(rowIdx) * topK;
656656
}
657-
logits += rowIdx * stride0;
657+
logits += static_cast<int64_t>(rowIdx) * stride0;
658658

659659
topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort, multipleBlocksPerRow, mergeBlocks>(
660660
indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK);

tests/unittest/_torch/thop/parallel/test_indexer_topk.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,31 @@
11
import pytest
22
import torch
3+
from utils.util import getSMVersion, skip_pre_hopper
34

45
# Import tensorrt_llm to load custom CUDA operators (indexer_topk_decode, indexer_topk_prefill)
56
import tensorrt_llm # noqa: F401
67

8+
if not torch.cuda.is_available():
9+
pytest.skip("CUDA is required for indexer_topk tests", allow_module_level=True)
10+
11+
12+
def _prefill_param_values():
13+
"""
14+
Decide parameter coverage based on GPU architecture (SM version).
15+
16+
- pre-Hopper (SM < 90): skip via @skip_pre_hopper
17+
- Hopper (SM == 90): reduced coverage
18+
- Blackwell (SM >= 100): full coverage
19+
"""
20+
sm = getSMVersion()
21+
if sm >= 100: # Blackwell family
22+
return [1, 32], [4096, 8192, 32768]
23+
# Hopper (and other >= 90 but < 100, if any): reduced coverage
24+
return [1, 4], [4096, 8192, 32768]
25+
26+
27+
_PREFILL_BATCH_SIZES, _PREFILL_NUM_TOKENS = _prefill_param_values()
28+
729

830
def create_random_logits(
931
row_starts: torch.Tensor,
@@ -197,27 +219,38 @@ def test_indexer_topk_decode(batch_size, next_n, index_topk, num_tokens):
197219
), "CUDA top_k_per_row results don't match torch.topk"
198220

199221

200-
@pytest.mark.parametrize("batch_size", [1, 512, 2048])
222+
@skip_pre_hopper
223+
@pytest.mark.parametrize("batch_size", _PREFILL_BATCH_SIZES)
201224
@pytest.mark.parametrize("index_topk", [2048, 128])
202-
@pytest.mark.parametrize("num_tokens", [4096, 8192])
225+
@pytest.mark.parametrize("num_tokens", _PREFILL_NUM_TOKENS)
203226
def test_indexer_topk_prefill(batch_size, index_topk, num_tokens):
204227
torch.manual_seed(24)
205228
torch.cuda.manual_seed(24)
206229

207-
# Set input data
208-
row_starts = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
209-
row_ends = torch.arange(1, batch_size + 1, device="cuda", dtype=torch.int32)
230+
# gen random input for the sequence length
231+
seq_lens = generate_seq_lens(batch_size, index_topk, num_tokens)
232+
num_gen_tokens = seq_lens.sum()
233+
234+
# gen the row_starts and row_ends (from 1 to ...)
235+
row_starts = torch.zeros(num_gen_tokens, dtype=torch.int32, device="cuda")
236+
row_indices = torch.arange(1, seq_lens.max() + 1, dtype=torch.int32, device="cuda")
237+
row_ends = row_indices.expand(seq_lens.size(0), -1)[
238+
row_indices.expand(seq_lens.size(0), -1) <= seq_lens.unsqueeze(1)
239+
].contiguous()
210240

241+
# gen logits
211242
logits = create_random_logits(row_starts, row_ends, torch.float32, 42)
212243

213244
# Create output tensors
214-
indices = torch.empty((batch_size, index_topk), dtype=torch.int32, device="cuda")
245+
indices = torch.empty((num_gen_tokens, index_topk), dtype=torch.int32, device="cuda")
215246

216247
# Run CUDA implementation
217248
torch.ops.trtllm.indexer_topk_prefill(logits, row_starts, row_ends, indices, index_topk)
249+
torch.cuda.synchronize()
218250

219251
# Run reference implementation
220-
torch_indices = logits.topk(min(index_topk, max(row_ends)), dim=-1)[1]
252+
max_row_len = row_ends.max().item()
253+
torch_indices = logits.topk(min(index_topk, max_row_len), dim=-1)[1]
221254
mask_lo = torch_indices >= 0
222255
mask_hi = (torch_indices - (row_ends - row_starts)[:, None]) < 0
223256
mask = mask_lo & mask_hi

0 commit comments

Comments (0)