Commit 7eee9a9

doc: Update doc for Deepseek min latency (NVIDIA#3717)
* Tidy code

  Signed-off-by: Zongfei Jing <[email protected]>

* Update doc for min latency deepseek

  Signed-off-by: Zongfei Jing <[email protected]>

* Throw exception for RouterKernel when not running on sm90+

  Signed-off-by: Zongfei Jing <[email protected]>

---------

Signed-off-by: Zongfei Jing <[email protected]>
1 parent 0ae7017 commit 7eee9a9

File tree: 3 files changed, +20 -10 lines


cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/trtllmGenSrc/RoutingKernel.cu

Lines changed: 7 additions & 0 deletions

```diff
@@ -698,6 +698,7 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Nu
 #else
 __global__ void routingIndicesClusterKernel(KernelParams params)
 {
+    assert(false && "routingIndicesClusterKernel is only supported on SM90+ architectures");
 }
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -886,6 +887,8 @@ __global__ void __launch_bounds__(NumThreads) routingIndicesCoopKernel(KernelPar
         params.mPtrPermutedIdxToTokenIdx[permutedIdx] = tokenIdx;
     }
 }
+#else
+    assert(false && "routingIndicesCoopKernel is only supported on SM90+ architectures");
 #endif
 }
 
@@ -973,6 +976,8 @@ __global__ void __launch_bounds__(NumThreads) routingIndicesHistogramKernel(Kern
     // Reduce histograms with atomics.
     int32_t const localExpertCount = smemExpertCount[threadIdx.x];
     atomicAdd(&params.mPtrExpertCounts[threadIdx.x], localExpertCount);
+#else
+    assert(false && "routingIndicesHistogramKernel is only supported on SM90+ architectures");
 #endif
 }
 
@@ -1204,6 +1209,8 @@ __global__ void __launch_bounds__(NumThreads) routingIndicesOffsetsKernel(Kernel
     {
         cudaTriggerProgrammaticLaunchCompletion();
     }
+#else
+    assert(false && "routingIndicesOffsetsKernel is only supported on SM90+ architectures");
 #endif
 }
 
```
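
The new device-side asserts make an unsupported launch fail loudly in debug builds instead of silently executing an empty kernel body on pre-Hopper GPUs. Below is a minimal pre-flight check a caller could run before selecting a backend that launches these kernels, assuming PyTorch is available (a hypothetical helper for illustration, not TensorRT-LLM's actual dispatch logic):

```python
# Hypothetical pre-flight check (not TensorRT-LLM's dispatch code): the
# routing kernels above assert on pre-SM90 GPUs, so verify the compute
# capability before selecting a backend that would launch them.
import torch


def is_sm90_or_newer(device: int = 0) -> bool:
    """Return True if the GPU is Hopper-class (compute capability >= 9.0)."""
    major, _minor = torch.cuda.get_device_capability(device)
    return major >= 9


if torch.cuda.is_available() and not is_sm90_or_newer():
    print("Routing kernels need SM90+; fall back to a non-TRTLLM MoE backend.")
```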

docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md

Lines changed: 9 additions & 7 deletions

````diff
@@ -110,6 +110,7 @@ cat >./extra-llm-api-config.yml<<EOF
 pytorch_backend_config:
   enable_overlap_scheduler: true
   use_cuda_graph: true
+  moe_backend: TRTLLM
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 3
@@ -125,7 +126,7 @@ trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
   --concurrency 1 \
   --max_batch_size 1 \
   --tp 8 \
-  --ep 4 \
+  --ep 2 \
   --extra_llm_api_options ./extra-llm-api-config.yml
 ```
 
@@ -147,12 +148,13 @@ The perf can be different when using different datasets and different machines.
 ===========================================================
 = PERFORMANCE OVERVIEW
 ===========================================================
-Request Throughput (req/sec):                     0.1244
-Total Output Throughput (tokens/sec):             254.5535
-Per User Output Throughput (tokens/sec/user):     254.7634
-Per GPU Output Throughput (tokens/sec/gpu):       31.8192
-Total Latency (ms):                               80368.1616
-Average request latency (ms):                     8036.7546
+Request Throughput (req/sec):                     0.1341
+Total Output Throughput (tokens/sec):             274.4168
+Per User Output Throughput (tokens/sec/user):     274.7188
+Per GPU Output Throughput (tokens/sec/gpu):       34.3021
+Total Token Throughput (tokens/sec):              414.0461
+Total Latency (ms):                               74561.7520
+Average request latency (ms):                     7456.1219
 ```
 
 ### B200 max-throughput
````
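
As a quick sanity check on the updated numbers (not part of the commit): the per-GPU figure is the total output throughput divided by the 8 GPUs from `--tp 8`, and the request throughput times the total latency implies the run completed about 10 requests:

```python
# Sanity-check the updated benchmark figures from the diff above.
total_output_tps = 274.4168            # Total Output Throughput (tokens/sec)
num_gpus = 8                           # --tp 8
print(round(total_output_tps / num_gpus, 4))    # 34.3021, the per-GPU figure

total_latency_s = 74561.7520 / 1000.0  # Total Latency (ms) -> seconds
req_throughput = 0.1341                # Request Throughput (req/sec)
print(round(total_latency_s * req_throughput))  # ~10 requests in the run
```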

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -624,6 +624,9 @@ def _compute_mlp_tp_size(self, intermediate_size: int,
         )
         return mlp_tp_size
 
+    def _enable_latency_mode(self, num_tokens: int):
+        return num_tokens <= 128 and self.fusion_config.POST_MOE_FUSION and self.is_nvfp4 and self.model_config.moe_backend == 'CUTLASS'
+
     def forward(
         self,
         position_ids: torch.LongTensor,
@@ -650,9 +653,7 @@ def forward(
         using_prev_fusion = self.deepseek_allreduce_disabled or hidden_states.size(
             0) > 128
 
-        min_latency_mode = True if hidden_states.size(
-            0
-        ) <= 128 and self.fusion_config.POST_MOE_FUSION and self.is_nvfp4 and self.model_config.moe_backend == 'CUTLASS' else False
+        min_latency_mode = self._enable_latency_mode(hidden_states.size(0))
 
         if self.fusion_config.PRE_MOE_FUSION:
             # Custom AR Fusion for DeepseekV3
```
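
The Python change is a pure refactor: the inline `True if ... else False` expression (the ternary was redundant, since the condition already evaluates to a bool) moves into the named `_enable_latency_mode` helper. Here is a self-contained sketch of the extracted predicate, using stub objects in place of the real TensorRT-LLM configs (names are illustrative only):

```python
# Self-contained sketch of the refactored gating predicate; SimpleNamespace
# stubs stand in for the real model/fusion config objects.
from types import SimpleNamespace


class LayerSketch:
    def __init__(self):
        self.fusion_config = SimpleNamespace(POST_MOE_FUSION=True)
        self.model_config = SimpleNamespace(moe_backend='CUTLASS')
        self.is_nvfp4 = True

    def _enable_latency_mode(self, num_tokens: int) -> bool:
        # Min-latency MoE path requires a small batch (<= 128 tokens),
        # post-MoE fusion, NVFP4 weights, and the CUTLASS MoE backend.
        return (num_tokens <= 128 and self.fusion_config.POST_MOE_FUSION
                and self.is_nvfp4
                and self.model_config.moe_backend == 'CUTLASS')


layer = LayerSketch()
assert layer._enable_latency_mode(64)        # small batch -> min-latency path
assert not layer._enable_latency_mode(256)   # large batch -> standard path
```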
