Skip to content

Commit 910c070

Browse files
bo-nv and syuoni authored
[None][fix] fix accuracy issue (cherry-pick #11157 and #9530) (#11222)
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Signed-off-by: Bo Deng <deemod@nvidia.com> Co-authored-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
1 parent d248aef commit 910c070

File tree

3 files changed

+14
-2
lines changed

3 files changed

+14
-2
lines changed

cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
120120
auto warp = cg::tiled_partition<WARP_SIZE>(block);
121121

122122
BaseType minScore = BaseType{-INFINITY};
123+
124+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
125+
cudaGridDependencySynchronize();
126+
#endif
127+
123128
for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum)
124129
{
125130
auto scoreOffset = tokenId * numExperts;
@@ -168,6 +173,10 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
168173
}
169174
}
170175
} // end for tokenId
176+
177+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
178+
cudaTriggerProgrammaticLaunchCompletion();
179+
#endif
171180
}
172181

173182
int nextPowerOfTwo(int num)

tensorrt_llm/_torch/models/modeling_gpt_oss.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939

4040
# Use TinyGEMM when the number of tokens is not larger than this threshold
4141
MIN_LATENCY_TINYGEMM_NUM_TOKENS = 128
42+
# Enable TinyGEMM optimization (disabled by default, set ENABLE_TINYGEMM=1 to enable)
43+
ENABLE_TINYGEMM = os.environ.get('ENABLE_TINYGEMM', '0') == '1'
4244

4345

4446
class AttentionBlock(Attention):
@@ -226,7 +228,7 @@ def _create_ideal_expert_load_balanced_logits(
226228
dtype=pretrained_config.torch_dtype)
227229

228230
def compute_gate_output(self, x: torch.Tensor) -> torch.Tensor:
229-
if get_sm_version() in [
231+
if ENABLE_TINYGEMM and get_sm_version() in [
230232
90, 100, 103
231233
] and x.shape[0] <= MIN_LATENCY_TINYGEMM_NUM_TOKENS:
232234
weight = self.gate.weight

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1530,7 +1530,8 @@ def previous_seq_slots_device():
15301530
num_draft_tokens = len(draft_tokens)
15311531
total_num_tokens = len(position_ids)
15321532
assert total_num_tokens <= self.max_num_tokens, (
1533-
"total_num_tokens should be less than or equal to max_num_tokens")
1533+
f"total_num_tokens ({total_num_tokens}) should be less than or equal to max_num_tokens ({self.max_num_tokens})"
1534+
)
15341535
# if exist requests that do not have previous batch, copy input_ids and draft_tokens
15351536
if num_tokens > 0:
15361537
input_ids = torch.tensor(input_ids,

0 commit comments

Comments (0)