lancelly
diff --git a/‎.github/workflows/blossom-ci.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/blossom-ci.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp‎
Lines changed: 7 additions & 1 deletion b/‎cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎tensorrt_llm/_torch/models/modeling_nemotron_h.py‎
Lines changed: 117 additions & 24 deletions b/‎tensorrt_llm/_torch/models/modeling_nemotron_h.py‎
Lines changed: 117 additions & 24 deletions
diff --git a/‎tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py‎
Lines changed: 7 additions & 2 deletions b/‎tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎tensorrt_llm/_torch/pyexecutor/_util.py‎
Lines changed: 1 addition & 0 deletions b/‎tensorrt_llm/_torch/pyexecutor/_util.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tensorrt_llm/_torch/pyexecutor/py_executor.py‎
Lines changed: 11 additions & 4 deletions b/‎tensorrt_llm/_torch/pyexecutor/py_executor.py‎
Lines changed: 11 additions & 4 deletions
@@ -191,6 +191,7 @@ jobs:
         "litaotju",
         "liyuhannnnn",
         "lkomali",
+        "longcheng-nv",
         "longlee0622",
         "lowsfer",
         "lucaslie",
@@ -293,6 +294,7 @@ jobs:
         "tcherckez-nvidia",
         "thorjohnsen",
         "tianyuxbear",
+        "tianyuz-nv",
         "tiffany940107",
         "tijyojwad",
         "timlee0212",
@@ -332,11 +334,13 @@ jobs:
         "xueweilnvidia",
         "xupinjie",
         "xuwchen",
+        "xwang233",
         "xxi-nv",
         "yali-arch",
         "yechank-nvidia",
         "yibinl-nvidia",
         "yifeizhang-c",
+        "YihuiLu512",
         "yihwang-nv",
         "yijingl-nvidia",
         "yilin-void",
 
@@ -68,7 +68,7 @@ std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optiona
 class PyKvCacheManager : public tbk::BaseKVCacheManager
 {
 public:
-    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 30);
+    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 36);
 
     // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors
     void allocatePools(bool useUvm = false) override
@@ -255,6 +255,12 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager
     {
         NB_OVERRIDE_PURE(flushIterationEvents);
     }
+
+    SizeType32 countReusableBlocks(VecUniqueTokens const& uniqueTokens, tb::LlmRequest const& llmRequest,
+        bool onlyAllocated = false) const override
+    { // Trampoline body: passes uniqueTokens, llmRequest and onlyAllocated through unchanged.
+        NB_OVERRIDE_PURE(countReusableBlocks, uniqueTokens, llmRequest, onlyAllocated); // NOTE(review): presumably dispatches to the Python-side override with no C++ fallback; confirm against the nanobind trampoline docs and that the NB_TRAMPOLINE slot count covers this method.
+    }
 };
 
 // TODO: Deduplicate executor bindings KvCacheStats
 
@@ -31,7 +31,7 @@
 from tensorrt_llm.logger import logger
 
 from ..attention_backend import AttentionMetadata
-from ..distributed import AllReduce
+from ..distributed import AllReduce, AllReduceFusionOp, AllReduceParams
 from ..model_config import ModelConfig
 from ..modules.attention import Attention
 from ..modules.decoder_layer import DecoderLayer
@@ -59,6 +59,7 @@ def __init__(
         self,
         model_config: ModelConfig[NemotronHConfig],
         layer_idx: int,
+        reduce_output: bool = True,
     ):
         config = model_config.pretrained_config
         if isinstance(config.intermediate_size, list):
@@ -76,6 +77,7 @@ def __init__(
             activation=relu2,
             dtype=config.torch_dtype,
             config=model_config,
+            reduce_output=reduce_output,
         )
         self.layer_idx = layer_idx
 
@@ -119,7 +121,8 @@ def forward(
     ) -> torch.Tensor:
         return super().forward(position_ids=None,
                                hidden_states=hidden_states,
-                               attn_metadata=attn_metadata)
+                               attn_metadata=attn_metadata,
+                               **kwargs)
 
 
 # Ref code: https://huggingface.co/nvidia/Nemotron-Nano-3-30B-A3.5B-dev-1024/blob/main/modeling_nemotron_h.py#L818
@@ -130,6 +133,7 @@ def __init__(
         model_config: ModelConfig[PretrainedConfig],
         layer_idx: int,
         aux_stream_dict: dict[AuxStreamType, torch.cuda.Stream],
+        reduce_output: bool = False,
     ):
         super().__init__()
 
@@ -226,8 +230,7 @@ def __init__(
             activation_type=self.activation_type,
         )
 
-        if not model_config.mapping.enable_attention_dp:
-            # AllReduce for combining shared and routed expert outputs in multi-GPU settings.
+        if reduce_output:
             self.allreduce = AllReduce(
                 mapping=model_config.mapping,
                 strategy=model_config.allreduce_strategy,
@@ -324,8 +327,10 @@ def _compute_routed_output():
         final_hidden_states = shared_output + routed_output
 
         # Perform all-reduce after combining outputs for multi-GPU support.
-        if not self.enable_attention_dp and self.mapping.tp_size > 1:
-            final_hidden_states = self.allreduce(final_hidden_states)
+        if self.allreduce is not None:
+            final_hidden_states = self.allreduce(
+                final_hidden_states,
+                all_reduce_params=kwargs.get('all_reduce_params'))
 
         return final_hidden_states.view(orig_shape)
 
@@ -341,6 +346,7 @@ def __init__(
         # * -> TransformerLayer
         layer_type: str,
         aux_stream_dict: dict[AuxStreamType, torch.cuda.Stream],
+        fuse_allreduce_norm: bool = False,
     ):
         super().__init__()
 
@@ -373,6 +379,13 @@ def __init__(
             )
             self.is_nvfp4 = False
 
+        # fuse_allreduce_norm is the model-level flag.  When enabled, ALL
+        # layers defer mixer AllReduce to the next layer's pre_allreduce (or
+        # the model's final_allreduce).  Only layers 1+ create a pre_allreduce
+        # module; layer 0's input is already reduced from the embedding.
+        self.fuse_allreduce_norm = fuse_allreduce_norm
+        self.is_moe_layer = (layer_type == "E")
+
         self.norm = RMSNorm(
             hidden_size=config.hidden_size,
             eps=config.rms_norm_eps,
@@ -382,9 +395,22 @@ def __init__(
             quantize_type="nvfp4" if self.is_nvfp4 else None,
             # Enable high precision output for MoE layer (only with NVFP4).
             # It might be overridden in `_try_attach_nvfp4_scale` function.
-            return_hp_output=layer_type == "E" and self.is_nvfp4,
+            return_hp_output=self.is_moe_layer and self.is_nvfp4,
         )
 
+        if fuse_allreduce_norm and layer_idx > 0:
+            self.pre_allreduce = AllReduce(
+                mapping=model_config.mapping,
+                strategy=model_config.allreduce_strategy,
+            )
+
+        # Mixer creation.  The fuse_allreduce_norm optimization is orthogonal
+        # to AllReduce topology: Transformer/MoE gate it at forward time via
+        # AllReduceParams; MLP/Mamba gate it at init time via reduce_output
+        # (their base classes don't thread all_reduce_params through forward).
+        has_tp_allreduce = (not model_config.mapping.enable_attention_dp
+                            and model_config.mapping.tp_size > 1)
+
         if layer_type == "M":
             self.mixer = Mamba2Mixer(
                 d_model=config.hidden_size,
@@ -399,19 +425,27 @@ def __init__(
                 dtype=config.torch_dtype,
                 config=model_config,
             )
+            if fuse_allreduce_norm:
+                self.mixer.out_proj.reduce_output = False
         elif layer_type == "-":
-            self.mixer = MLPLayer(model_config, layer_idx)
+            self.mixer = MLPLayer(
+                model_config,
+                layer_idx,
+                reduce_output=not fuse_allreduce_norm,
+            )
         elif layer_type == "*":
             self.mixer = TransformerLayer(
                 model_config,
                 layer_idx,
-                reduce_output=not model_config.mapping.enable_attention_dp
-                and model_config.mapping.tp_size > 1,
+                reduce_output=has_tp_allreduce,
             )
         elif layer_type == "E":
-            self.mixer = NemotronHMOE(model_config,
-                                      layer_idx=layer_idx,
-                                      aux_stream_dict=aux_stream_dict)
+            self.mixer = NemotronHMOE(
+                model_config,
+                layer_idx=layer_idx,
+                aux_stream_dict=aux_stream_dict,
+                reduce_output=has_tp_allreduce,
+            )
         else:
             raise ValueError(f"{layer_type} is not supported")
 
@@ -436,7 +470,7 @@ def _try_attach_nvfp4_scale(self):
 
         # Special handling for MoE layer: fetch shared_expert.up_proj.input_scale
         # as representation of the input scale.
-        if self.layer_type == "E":
+        if self.is_moe_layer:
             if (hasattr(self.mixer, "shared_experts")
                     and self.mixer.shared_experts is not None
                     and hasattr(self.mixer.shared_experts, "up_proj")
@@ -463,16 +497,50 @@ def forward(
         if residual is None:
             residual = torch.zeros_like(hidden_states)
 
-        if self.norm.return_hp_output:
+        if hasattr(self, 'pre_allreduce'):
+            norm = self.norm
+            has_nvfp4_scale = hasattr(norm, 'nvfp4_scale')
+            if norm.is_nvfp4 and has_nvfp4_scale and norm.return_hp_output:
+                fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4
+            elif norm.is_nvfp4 and has_nvfp4_scale:
+                fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4
+            else:
+                fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM
+            all_reduce_params = AllReduceParams(
+                fusion_op=fusion_op,
+                residual=residual,
+                norm_weight=norm.weight,
+                eps=norm.variance_epsilon,
+                trigger_completion_at_end=False,
+                **(dict(scale=norm.nvfp4_scale)
+                   if has_nvfp4_scale and norm.is_nvfp4 else {}),
+            )
+            result = self.pre_allreduce(hidden_states,
+                                        all_reduce_params=all_reduce_params)
+            if fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4:
+                norm_out, act_fp4, act_sf, residual = result
+                hidden_states = (Fp4QuantizedTensor(act_fp4, act_sf), norm_out)
+            elif fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4:
+                act_fp4, act_sf, residual = result
+                hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
+            else:
+                hidden_states, residual = result
+        elif self.norm.return_hp_output:
             hidden_states, residual, high_precision_normed_output = self.norm(
                 hidden_states, residual)
             hidden_states = (hidden_states, high_precision_normed_output)
         else:
             hidden_states, residual = self.norm(hidden_states, residual)
-        hidden_states = self.mixer(hidden_states,
-                                   attn_metadata,
-                                   spec_metadata=spec_metadata,
-                                   **kwargs)
+
+        # When fuse_allreduce_norm is active, tell Transformer/MoE mixers to
+        # skip their own AllReduce (it is handled by pre_allreduce /
+        # final_allreduce instead).  MLP/Mamba ignore this kwarg; their
+        # reduce_output was set at init time.
+        mixer_kwargs = dict(spec_metadata=spec_metadata, **kwargs)
+        if self.fuse_allreduce_norm:
+            mixer_kwargs['all_reduce_params'] = AllReduceParams(
+                enable_allreduce=False)
+        hidden_states = self.mixer(hidden_states, attn_metadata, **mixer_kwargs)
 
         if spec_metadata is not None and spec_metadata.is_layer_capture(
                 self.layer_idx):
@@ -519,14 +587,20 @@ def __init__(self, model_config: ModelConfig[NemotronHConfig]):
                 gather_output=True,
             )
 
+        self.fuse_allreduce_norm = (not model_config.mapping.enable_attention_dp
+                                    and model_config.mapping.tp_size > 1)
+
         # create layers
         layers = []
         for layer_idx, layer_type in enumerate(config.hybrid_override_pattern):
             layers.append(
-                NemotronHLayer(model_config,
-                               layer_idx,
-                               layer_type,
-                               aux_stream_dict=self.aux_stream_dict))
+                NemotronHLayer(
+                    model_config,
+                    layer_idx,
+                    layer_type,
+                    aux_stream_dict=self.aux_stream_dict,
+                    fuse_allreduce_norm=self.fuse_allreduce_norm,
+                ))
         self.layers = nn.ModuleList(layers)
         self.num_hidden_layers = config.num_hidden_layers
 
@@ -537,6 +611,13 @@ def __init__(self, model_config: ModelConfig[NemotronHConfig]):
             dtype=config.torch_dtype,
         )
 
+        # AllReduce for fusing with final norm (after last layer's mixer)
+        if self.fuse_allreduce_norm:
+            self.final_allreduce = AllReduce(
+                mapping=model_config.mapping,
+                strategy=model_config.allreduce_strategy,
+            )
+
     def forward(
         self,
         attn_metadata: AttentionMetadata,
@@ -567,7 +648,19 @@ def forward(
                 spec_metadata=spec_metadata,
                 mamba_metadata=mamba_metadata,
             )
-        hidden_states, _ = self.norm_f(hidden_states, residual)
+
+        if self.fuse_allreduce_norm:
+            hidden_states, _ = self.final_allreduce(
+                hidden_states,
+                all_reduce_params=AllReduceParams(
+                    fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
+                    residual=residual,
+                    norm_weight=self.norm_f.weight,
+                    eps=self.norm_f.variance_epsilon,
+                    trigger_completion_at_end=False,
+                ))
+        else:
+            hidden_states, _ = self.norm_f(hidden_states, residual)
         return hidden_states
 
 
 
@@ -139,8 +139,13 @@ def __init__(
 
         # Choose between flashinfer and native implementation. (default to flashinfer)
         self._mamba_ssm_cache_dtype = config.quant_config.mamba_ssm_cache_dtype
-        supported_head_dim_in_flashinfer = [64, 128]
-        self._use_flashinfer = head_dim in supported_head_dim_in_flashinfer
+        # TODO: Update head_dims and head_group_ratios once flashinfer is updated.
+        supported_head_dims = [64, 128]
+        supported_head_group_ratios = [1, 8, 16]
+        head_group_ratio = (self.tp_nheads //
+                            self.tp_ngroups if self.tp_ngroups > 0 else 0)
+        self._use_flashinfer = (head_dim in supported_head_dims and
+                                head_group_ratio in supported_head_group_ratios)
         # Stochastic rounding requires FlashInfer and fp16 cache
         self._use_stochastic_rounding = (
             config.quant_config.mamba_ssm_stochastic_rounding
 
@@ -1335,6 +1335,7 @@ def create_py_executor_instance(
     waiting_queue_policy = (scheduler_config.waiting_queue_policy
                             if scheduler_config is not None else
                             WaitingQueuePolicy.FCFS)
+
     return PyExecutor(
         resource_manager,
         scheduler,
 
@@ -67,7 +67,7 @@
 from .scheduler import (RequestScheduler, ScheduledRequests,
                         SerializableSchedulerOutput, WaitingQueue,
                         create_waiting_queue)
-from .scheduler.adp_router import ADPRouter, DefaultADPRouter
+from .scheduler.adp_router import ADPRouter
 
 # Environment variable to specify iteration ranges for profiling start/stop.
 # Format: "start1-stop1,start2-stop2,..." or single iterations "iter1,iter2,..."
@@ -285,8 +285,7 @@ def __init__(
             virtual_memory_pools: Optional[dict] = None,
             hang_detection_timeout: Optional[int] = None,
             execution_stream: Optional[torch.cuda.Stream] = None,
-            waiting_queue_policy: WaitingQueuePolicy = WaitingQueuePolicy.FCFS,
-            adp_router: Optional[ADPRouter] = None):
+            waiting_queue_policy: WaitingQueuePolicy = WaitingQueuePolicy.FCFS):
         super(PyExecutor, self).__init__()
         self.device_id = torch.cuda.current_device()
         self.global_rank = dist.rank
@@ -313,7 +312,6 @@ def __init__(
         self.model_engine = model_engine
         self.enable_attention_dp = model_engine.enable_attention_dp
         self.dist = dist
-        self.adp_router: ADPRouter = (adp_router or DefaultADPRouter(dist=dist))
         self.sampler = sampler
         self.drafter = drafter
         self.draft_model_engine = getattr(self.drafter, "draft_model_engine",
@@ -387,6 +385,12 @@ def __init__(
             self.enable_kv_cache_reuse
             and self.kv_cache_manager.enable_partial_reuse)
 
+        self.adp_router: ADPRouter = ADPRouter.create(
+            dist=self.dist,
+            kv_cache_manager=self.kv_cache_manager,
+            attention_dp_config=self.llm_args.attention_dp_config,
+        )
+
         self.max_input_len = max_input_len
         # _executor_loop private data
         self.max_num_active_requests = model_engine.get_max_num_sequences()
@@ -2573,6 +2577,9 @@ def _fetch_new_requests(
 
         # 6. Schedule requests across ranks (DP only)
         if self.enable_attention_dp:
+            if self.adp_router.needs_prefix_matches:
+                self.adp_router.gather_prefix_matches(new_requests)
+
             all_ranks_new_requests, self.expected_num_active_requests = \
                 self.adp_router.route_requests(
                     all_rank_states, new_requests,
Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,7 @@ std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optiona`
`68`	`68`	`class PyKvCacheManager : public tbk::BaseKVCacheManager`
`69`	`69`	`{`
`70`	`70`	`public:`
`71`		`- NB_TRAMPOLINE(tbk::BaseKVCacheManager, 30);`
	`71`	`+ NB_TRAMPOLINE(tbk::BaseKVCacheManager, 36);`
`72`	`72`
`73`	`73`	`// using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors`
`74`	`74`	`void allocatePools(bool useUvm = false) override`
`@@ -255,6 +255,12 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager`
`255`	`255`	`{`
`256`	`256`	`NB_OVERRIDE_PURE(flushIterationEvents);`
`257`	`257`	`}`
	`258`	`+`
	`259`	`+ SizeType32 countReusableBlocks(VecUniqueTokens const& uniqueTokens, tb::LlmRequest const& llmRequest,`
	`260`	`+ bool onlyAllocated = false) const override`
	`261`	`+ {`
	`262`	`+ NB_OVERRIDE_PURE(countReusableBlocks, uniqueTokens, llmRequest, onlyAllocated);`
	`263`	`+ }`
`258`	`264`	`};`
`259`	`265`
`260`	`266`	`// TODO: Deduplicate executor bindings KvCacheStats`