Changes from all commits
24 commits
74ec464
[https://nvbugs/5569713][fix] Disable fp8 deep gemm for EXAONE-4.0-32…
JunyiXu-nv Oct 21, 2025
c3e181d
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more…
SimengLiu-nv Oct 22, 2025
aaec2f2
[https://nvbugs/5504095][fix] Unwaive test_user_specify_workspace cas…
nv-guomingz Oct 22, 2025
7a33269
[https://nvbugs/5546510][fix] Move torch.cuda.Stream out of torch com…
liji-nv Oct 22, 2025
ead3345
[https://nvbugs/5565549][fix] unwaive test_disaggregated_spec_dec_bat…
bo-nv Oct 22, 2025
3d55b5e
[https://nvbugs/5575829][fix] Unwaive gpt-oss test (#8576)
LinPoly Oct 22, 2025
8963c14
[https://nvbugs/5569754][fix] trtllm-llmapi-launch port conflict (#8…
Superjomn Oct 23, 2025
b5c9c43
[https://nvbugs/5582277][fix] rework DisaggPPTerminationHandler to fi…
reasonsolo Oct 23, 2025
a318a97
[https://nvbugs/5575902][fix] set max_batch_size=1 to stabilize accur…
reasonsolo Oct 23, 2025
d2ad5d3
[https://nvbugs/5587456][fix] Remove multimodal test cases using TRT …
jieli-matrix Oct 24, 2025
d8ad4bd
[None][test] Clean cache for certain easily hang cases (#8619)
crazydemo Oct 24, 2025
a5d39f5
[https://nvbugs/5597647][fix] Fix MNNVL Allreduce accuracy issue on H…
timlee0212 Oct 27, 2025
870da18
[https://nvbugs/5608489][fix] Fix output unpack issues for Llama3/4 N…
hyukn Oct 28, 2025
584ed86
[https://nvbugs/5572320][fix] Ported test_ad_trtllm_bench.py from mai…
MrGeva Oct 28, 2025
ddbb116
[https://nvbugs/5564465][fix] Overwrite only if default_max_tokens is…
LinPoly Oct 28, 2025
db697ce
[https://nvbugs/5578175][fix] Fix block range index (#8470)
chuangz0 Oct 28, 2025
5c42706
[https://nvbugs/5601203] [fix]Restrict fp8 blockscale moe case (#8583)
VALLIS-NERIA Oct 29, 2025
51493c1
[https://nvbugs/5606268][fix] Separate cuda graph workspace to preven…
JunyiXu-nv Oct 29, 2025
3ca32c4
[https://nvbugs/5575841] [test] Move test_moe.py to serial tests to i…
DomBrown Oct 30, 2025
3c2b4fe
[https://nvbugs/5488118][fix] Unwaive passed tests (#8758)
liji-nv Oct 31, 2025
0606630
[None][infra] Remove invaild waived tests which not in release branch…
ZhanruiSunCh Oct 31, 2025
5a41c16
[https://nvbugs/5325296][fix] Enable relaxed acceptance test on Black…
Barry-Delaney Oct 31, 2025
8a75dd5
[https://nvbugs/5444687][fix] Cherrypick online EPLB CI fix from main…
dongxuy04 Nov 3, 2025
6c1fdfd
[None][chore] Update linter rules for mass integration
mikeiovine Nov 4, 2025
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -1073,6 +1073,7 @@ common-files: &common_files |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py |
tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py |
tests/unittest/_torch/thop/serial/test_moe_alltoall.py |
+ tests/unittest/_torch/thop/serial/test_moe.py |
tests/unittest/api_stability/api_stability_core.py |
tests/unittest/api_stability/test_llm_api.py |
tests/unittest/bindings/binding_test_utils.py |
10 changes: 8 additions & 2 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2142,7 +2142,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
return 0;
}

- auto const numCurrTokens = mSequences.at(req.mRequestId).getNumTokens();
+ auto const numCurrTokens = getSequence(req.mRequestId).getNumTokens();
auto const generatedTokens = numCurrTokens - req.getPromptLen();
auto const maxTokensToAddToKVCache = req.mMaxNewTokens - generatedTokens;
auto const tokensPerStep = req.getNumDraftTokens() + 1;
@@ -2406,7 +2406,13 @@ void KVCacheManager::addSequence(
void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)
{
auto const requestId = llmRequest.mRequestId;
- if (mSequences.find(requestId) != mSequences.end())
+ bool found = false;
+ {
+ // protect the mSequences
+ std::scoped_lock lock(mSequencesMtx);
+ found = mSequences.find(requestId) != mSequences.end();
+ }
+ if (found)
{
auto& sequence = getSequence(requestId);
if (mEnableBlockReuse && !llmRequest.isDummyRequest())
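The storeContextBlocks hunk above takes the map lookup under mSequencesMtx and releases the lock before the follow-up work. A minimal Python sketch of that check-then-act pattern; the class and method names are illustrative, not the actual C++ API:

```python
import threading


class SequenceStoreSketch:
    """Hypothetical stand-in for the KVCacheManager's sequence map."""

    def __init__(self):
        self._sequences = {}                     # request_id -> sequence state
        self._sequences_mtx = threading.Lock()   # plays the role of mSequencesMtx

    def store_context_blocks(self, request_id):
        # Only the lookup happens under the lock; the follow-up work runs
        # outside it, mirroring the scoped_lock block in the diff.
        with self._sequences_mtx:
            found = request_id in self._sequences
        if found:
            self._store(request_id)

    def _store(self, request_id):
        # Placeholder for the block-reuse bookkeeping done in the real code.
        pass
```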
@@ -117,8 +117,8 @@ __device__ struct __attribute__((aligned(32))) LamportFlags
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000))
asm volatile("red.async.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
- #elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
- asm volatile("red.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
+ #elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
+ asm volatile("red.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
#else
atomicAdd(offset_access_ptr, 1);
#endif
13 changes: 13 additions & 0 deletions jenkins/L0_Test.groovy
@@ -2166,6 +2166,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def noIsolateTests = false
def rerunFailed = false

+ echoNodeAndGpuInfo(pipeline, stageName)
+ sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

+ def extraInternalEnv = ""
+ def pytestTestTimeout = "3600"

+ // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
+ extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
+ // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
+ extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
+ // Enable NCCL debug information for multi-GPU tests
+ extraInternalEnv += " NCCL_DEBUG=INFO"

def testDBList = renderTestDB(testList, llmSrc, stageName)

// Process shard test list and create separate files for regular and isolate tests
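The Groovy block above assembles per-run environment overrides (engine-build thread pool size, C++ test timeout, NCCL_DEBUG). A rough Python sketch of passing the same variables to a test invocation; the subprocess command and the pytest-timeout flag are assumptions, not part of the pipeline:

```python
import os
import subprocess


def run_tests_with_debug_env(test_target: str, tester_cores: int = 8):
    env = os.environ.copy()
    # Same variables the Jenkins stage exports via extraInternalEnv.
    env["__LUNOWUD"] = f"-thread_pool_size={tester_cores}"
    env["CPP_TEST_TIMEOUT_OVERRIDDEN"] = "3600"  # align C++ test timeout with pytest
    env["NCCL_DEBUG"] = "INFO"                   # verbose NCCL logs for multi-GPU runs
    # --timeout assumes the pytest-timeout plugin is installed.
    return subprocess.run(["pytest", test_target, "--timeout=3600"],
                          env=env, check=False)
```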
1 change: 1 addition & 0 deletions pyproject.toml
@@ -1113,6 +1113,7 @@ exclude = [
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_gemm.py",
"tests/unittest/_torch/thop/parallel/test_weight_only_quant_linear.py",
"tests/unittest/_torch/thop/serial/test_moe_alltoall.py",
"tests/unittest/_torch/thop/serial/test_moe.py",
"tests/unittest/api_stability/api_stability_core.py",
"tests/unittest/api_stability/test_llm_api.py",
"tests/unittest/bindings/binding_test_utils.py",
14 changes: 12 additions & 2 deletions tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -565,6 +565,7 @@ def is_nvfp4_output_kernel_available(
@dataclass(kw_only=True)
class TrtllmAttentionMetadata(AttentionMetadata):
workspace: Optional[torch.Tensor] = None
+ cuda_graph_workspace: Optional[torch.Tensor] = None

# TrtllmAttention needs to know the beam width to access to the cache indirection buffer,
# when beam search is enabled.
@@ -680,6 +681,14 @@ def _post_init_with_buffers(self, buffers) -> None:
device='cuda',
dtype=torch.int8,
)

+ if self.cuda_graph_workspace is None:
+ self.cuda_graph_workspace = torch.empty(
+ (0, ),
+ device='cuda',
+ dtype=torch.int8,
+ )

if self.kv_cache_manager is not None:
self.kv_cache_block_offsets = self.get_empty(
buffers,
@@ -1317,8 +1326,9 @@ def forward(
host_kv_cache_pool_pointers=metadata.host_kv_cache_pool_pointers,
host_kv_cache_pool_mapping=metadata.host_kv_cache_pool_mapping,
block_ids_per_seq=metadata.block_ids_per_seq,
- workspace=metadata.
- workspace, # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue.
+ # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue.
+ workspace=metadata.workspace
+ if not metadata.is_cuda_graph else metadata.cuda_graph_workspace,
cache_indirection=metadata.cache_indirection,
kv_scale_orig_quant=self.kv_scale_orig_quant,
kv_scale_quant_orig=self.kv_scale_quant_orig,
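The intent of the trtllm.py hunks is that CUDA-graph replays must not share the eager-path workspace, which can be freed or reallocated between runs. A small sketch of the selection logic with assumed attribute names (the real metadata class carries many more fields):

```python
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class AttentionMetadataSketch:
    """Illustrative subset of TrtllmAttentionMetadata."""
    is_cuda_graph: bool = False
    workspace: Optional[torch.Tensor] = None
    cuda_graph_workspace: Optional[torch.Tensor] = None


def pick_workspace(md: AttentionMetadataSketch) -> torch.Tensor:
    # Lazily create empty placeholder buffers; the real code resizes them later.
    if md.workspace is None:
        md.workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
    if md.cuda_graph_workspace is None:
        md.cuda_graph_workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
    # Captured graphs replay fixed device pointers, so they get a dedicated
    # workspace instead of the one eager calls may swap out underneath them.
    return md.cuda_graph_workspace if md.is_cuda_graph else md.workspace
```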
12 changes: 4 additions & 8 deletions tensorrt_llm/_torch/compilation/backend.py
@@ -51,9 +51,7 @@ def __init__(
self.capture_num_tokens = sorted(capture_num_tokens or [])
self.piecewise_cuda_graph = enable_piecewise_cuda_graph
self.no_optimization = False
- # We only need to create aux streams.
- self.aux_streams = Backend.Streams(
- [torch.cuda.Stream() for _ in range(max_num_streams - 1)])
+ self.num_streams = max_num_streams
self.events = Backend.Events()
inductor_config.enable_auto_functionalized_v2 = False

@@ -109,10 +107,8 @@ def optimize(
# Do not apply multi-stream if enable piecewise cuda graph or inductor
# For piecewise cuda graph, we will apply the multi-stream optimization in piecewise_optimizer
# For inductor, we do not control the passes inside inductor.
- if len(
- self.aux_streams
- ) > 0 and not self.piecewise_cuda_graph and not self.enable_inductor:
- num_events = multi_stream_schedule(gm, len(self.aux_streams) + 1)
+ if self.num_streams > 1 and not self.piecewise_cuda_graph and not self.enable_inductor:
+ num_events = multi_stream_schedule(gm, self.num_streams)
self.generate_events(num_events)

gm.recompile()
@@ -125,7 +121,7 @@ def optimize(
self.input_num_tokens,
self.capture_num_tokens,
self._graph_pool_handle,
- len(self.aux_streams) + 1,
+ self.num_streams,
)
self.generate_events(num_events)
return gm
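After this change the compile backend tracks only a stream count and no longer constructs torch.cuda.Stream objects itself; stream creation moves to the model engine (see the last file in this diff). A minimal sketch of the new gating condition; the class below is illustrative, not the real Backend:

```python
class CompileBackendSketch:
    """Illustrative: multi-stream scheduling is keyed off a count, not a stream list."""

    def __init__(self, max_num_streams: int = 1,
                 piecewise_cuda_graph: bool = False,
                 enable_inductor: bool = False):
        self.num_streams = max_num_streams
        self.piecewise_cuda_graph = piecewise_cuda_graph
        self.enable_inductor = enable_inductor

    def should_apply_multi_stream(self) -> bool:
        # Skipped for piecewise CUDA graphs (handled in the piecewise optimizer)
        # and for inductor (its internal passes are not under our control).
        return (self.num_streams > 1
                and not self.piecewise_cuda_graph
                and not self.enable_inductor)
```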
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/distributed/communicator.py
@@ -405,8 +405,8 @@ def tp_broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024):
def pp_allgather(self, obj):
return self.pp_comm.allgather(obj)

- def pp_gather(self, obj):
- return self.pp_comm.gather(obj)
+ def pp_gather(self, obj, root=0):
+ return self.pp_comm.gather(obj, root=root)

def pp_broadcast(self, obj, root=0):
return self.pp_comm.bcast(obj, root)
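pp_gather now forwards an explicit root rank to the underlying MPI gather. A minimal mpi4py sketch of that call shape (assumes mpi4py is available and the script runs under mpirun; COMM_WORLD stands in for the pipeline-parallel sub-communicator used in the real class):

```python
from mpi4py import MPI

comm = MPI.COMM_WORLD


def pp_gather(obj, root: int = 0):
    # Every rank contributes obj; only `root` receives the gathered list,
    # all other ranks get None back.
    return comm.gather(obj, root=root)


if __name__ == "__main__":
    gathered = pp_gather(comm.Get_rank(), root=0)
    if comm.Get_rank() == 0:
        print(gathered)  # e.g. [0, 1, 2, ...], one entry per rank
```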
14 changes: 13 additions & 1 deletion tensorrt_llm/_torch/models/modeling_exaone4.py
@@ -5,6 +5,7 @@

from tensorrt_llm._torch.modules.qk_norm_attention import QKNormRoPEAttention
from tensorrt_llm.functional import PositionEmbeddingType
+ from tensorrt_llm.quantization import QuantAlgo

from ..attention_backend import AttentionMetadata
from ..attention_backend.interface import (PositionalEmbeddingParams,
@@ -54,7 +55,8 @@ class Exaone4Attention(QKNormRoPEAttention):
def __init__(self,
model_config: ModelConfig[Exaone4Config],
layer_idx: Optional[int] = None,
- fuse_qk_norm_rope: bool = False):
+ fuse_qk_norm_rope: bool = False,
+ disable_deep_gemm: bool = False):
config = model_config.pretrained_config

self.attention_window_size = None
Expand Down Expand Up @@ -88,6 +90,7 @@ def __init__(self,
layer_idx=layer_idx,
dtype=config.torch_dtype,
config=model_config,
+ disable_deep_gemm=disable_deep_gemm,
)

def forward(
@@ -128,9 +131,17 @@ def __init__(
self.is_quanted = model_config.quant_config and model_config.quant_config.quant_mode.has_any_quant(
)

+ disable_deep_gemm = False
+ quant_config = getattr(model_config, "quant_config", None)
+ if quant_config is not None:
+ # EXAONE4 fp8 has an illegal memory access issue with deep_gemm.
+ disable_deep_gemm = getattr(quant_config, "quant_algo",
+ None) == QuantAlgo.FP8_BLOCK_SCALES

self.self_attn = Exaone4Attention(
model_config,
layer_idx=layer_idx,
+ disable_deep_gemm=disable_deep_gemm,
)

self.mlp = GatedMLP(
@@ -140,6 +151,7 @@ def __init__(
dtype=config.torch_dtype,
config=model_config,
layer_idx=layer_idx,
+ disable_deep_gemm=disable_deep_gemm,
)

self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size,
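The EXAONE-4.0 change flips a disable_deep_gemm flag only for the FP8 block-scale quantization recipe and threads it into the attention and MLP modules. A condensed sketch of the detection step, using a stand-in enum instead of the real QuantAlgo:

```python
from enum import Enum


class QuantAlgoSketch(Enum):
    """Illustrative subset of tensorrt_llm.quantization.QuantAlgo."""
    FP8 = "FP8"
    FP8_BLOCK_SCALES = "FP8_BLOCK_SCALES"


def should_disable_deep_gemm(quant_config) -> bool:
    # Only the FP8 block-scale recipe trips the deep_gemm workaround;
    # a missing quant_config (or any other algo) leaves deep_gemm enabled.
    if quant_config is None:
        return False
    return getattr(quant_config, "quant_algo", None) is QuantAlgoSketch.FP8_BLOCK_SCALES
```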
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/models/modeling_llama.py
@@ -599,7 +599,7 @@ def forward(
))

# Unpack the allreduce output
- if self.next_attn is not None and self.is_nvfp4:
+ if self.post_feed_forward_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = allreduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
@@ -790,7 +790,7 @@ def forward(
scale=scale,
eps=self.next_layer_layernorm.variance_epsilon,
))
- if self.next_attn is not None and self.is_nvfp4:
+ if self.post_mlp_fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
act_fp4, act_sf, residual = all_reduce_output
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
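Both Llama hunks switch the unpacking condition from "is the next layer NVFP4-capable" to "which fusion op was actually requested", so the tuple layout always matches what the fused allreduce produced. A schematic sketch with stand-in types:

```python
from enum import Enum, auto


class FusionOpSketch(Enum):
    """Illustrative stand-in for the AllReduceFusionOp values used above."""
    RESIDUAL_RMS_NORM = auto()
    RESIDUAL_RMS_NORM_QUANT_NVFP4 = auto()


def unpack_allreduce_output(fusion_op, output):
    # Keying on the fusion op that was passed to the kernel (rather than on
    # next_attn / is_nvfp4) keeps unpacking in sync with the producer.
    if fusion_op is FusionOpSketch.RESIDUAL_RMS_NORM_QUANT_NVFP4:
        act_fp4, act_sf, residual = output   # quantized activations + scale factors
        hidden_states = (act_fp4, act_sf)    # the real code wraps these in Fp4QuantizedTensor
    else:
        hidden_states, residual = output
    return hidden_states, residual
```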
8 changes: 6 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -270,6 +270,11 @@ def __init__(
use_ub = not use_ub_for_nccl and (
torch_compile_enable_userbuffers
and self._init_userbuffers(self.model.config.hidden_size))
+ self.backend_num_streams = Backend.Streams([
+ torch.cuda.Stream() for _ in
+ range(pytorch_backend_config.torch_compile_max_num_streams -
+ 1)
+ ])
self._torch_compile_backend = Backend(
torch_compile_inductor_enabled,
enable_userbuffers=use_ub,
@@ -2385,8 +2390,7 @@ def model_forward(self, **kwargs):
if self._torch_compile_backend is not None:
# Register aux streams and events to model extra attrs.
# The streams and events are list which could be updated during compilation.
attrs["aux_streams"] = weakref.ref(
self._torch_compile_backend.aux_streams)
attrs["aux_streams"] = weakref.ref(self.backend_num_streams)
attrs["events"] = weakref.ref(self._torch_compile_backend.events)
attrs["global_stream"] = torch.cuda.current_stream()

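Together with the backend.py change, the model engine now owns the auxiliary CUDA streams and hands the compile backend only a weak reference, which keeps torch.cuda.Stream construction outside the torch.compile region. A minimal sketch under assumed names (requires a CUDA device; Streams is shown as a plain list subclass so it can be weakly referenced):

```python
import weakref

import torch


class Streams(list):
    """list subclass: instances accept weak references, unlike plain lists."""


def build_aux_streams(max_num_streams: int) -> Streams:
    # One auxiliary stream per extra requested stream; the default stream is implicit.
    return Streams(torch.cuda.Stream() for _ in range(max_num_streams - 1))


# The engine keeps the strong reference so the streams outlive compilation;
# only a weak reference goes into the model's extra attrs.
aux_streams = build_aux_streams(max_num_streams=3)
extra_attrs = {
    "aux_streams": weakref.ref(aux_streams),
    "global_stream": torch.cuda.current_stream(),
}
```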