Skip to content

Commit 24b05e8

Browse files
xxi-nv (XingFei Xi)
authored and committed
[TRTLLM-8958] and [TRTLLM-8960]: create ConfigurableMoE and support the TRTLLMGenFusedMoE as the backend in ConfigurableMoE
Signed-off-by: xxi <xxi@nvidia.com> modified: tensorrt_llm/_torch/model_config.py modified: tensorrt_llm/_torch/models/modeling_deepseekv3.py modified: tensorrt_llm/_torch/models/modeling_gpt_oss.py modified: tensorrt_llm/_torch/models/modeling_hunyuan_moe.py modified: tensorrt_llm/_torch/models/modeling_utils.py modified: tensorrt_llm/_torch/modules/fused_moe/communication/__init__.py modified: tensorrt_llm/_torch/modules/fused_moe/communication/base.py modified: tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py modified: tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep.py modified: tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py renamed: tensorrt_llm/_torch/modules/fused_moe/communication/mnnvl_throughput.py -> tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_one_sided.py renamed: tensorrt_llm/_torch/modules/fused_moe/communication/mnnvl_latency.py -> tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_two_sided.py new file: tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py modified: tensorrt_llm/_torch/modules/fused_moe/create_moe.py modified: tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py modified: tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py modified: tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py modified: tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py modified: tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py modified: tensorrt_llm/_torch/modules/fused_moe/interface.py modified: tests/unittest/_torch/modules/test_fused_moe.py
1 parent e484bec commit 24b05e8

21 files changed

+2063
-495
lines changed

tensorrt_llm/_torch/model_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ def get_all_reduce_strategy(strategy: str = "AUTO"):
165165
self.allreduce_strategy = get_all_reduce_strategy(
166166
self.allreduce_strategy)
167167

168+
# Set default moe_max_num_tokens if not specified
169+
# The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
170+
if self.moe_max_num_tokens is None:
171+
self.moe_max_num_tokens = self.max_num_tokens * self.mapping.dp_size
172+
168173
@property
169174
def torch_dtype(self) -> torch.dtype:
170175
"""Get the torch dtype of the model."""

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
from ..modules.attention import MLA
5656
from ..modules.decoder_layer import DecoderLayer
5757
from ..modules.embedding import Embedding
58-
from ..modules.fused_moe import (DeepSeekV3MoeRoutingMethod,
58+
from ..modules.fused_moe import (DeepSeekV3MoeRoutingMethod, MoE,
5959
MoEWeightLoadingMode, create_moe)
6060
from ..modules.fused_moe.fused_moe_wide_ep import WideEPMoE
6161
from ..modules.gated_mlp import GatedMLP
@@ -382,6 +382,21 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
382382
"gate_proj": "w1",
383383
})
384384
module.load_weights(weights=[module_weights])
385+
elif names[-1] == "backend" and isinstance(module, MoE):
386+
# Special case: ConfigurableMoE.backend (TRTLLMGenFusedMoE)
387+
# Currently saved MoE weights don't include 'backend' in their names.
388+
# After MoE refactoring, ConfigurableMoE now has a backend submodule,
389+
# and weights loading is done in the backend, so module name includes '.backend'.
390+
# We need to use parent module name (without .backend) to match saved weight names.
391+
# After MoE refactoring is fully complete, all paths will follow this branch.
392+
parent_name = '.'.join(names[:-1])
393+
module_weights = filter_weights(parent_name, weights)
394+
module_weights = rename_moe_weight(module_weights, {
395+
"down_proj": "w2",
396+
"up_proj": "w3",
397+
"gate_proj": "w1",
398+
})
399+
module.load_weights(weights=[module_weights])
385400
elif names[-1] == "self_attn":
386401
continue
387402
elif names[-1] == "next_layer_layernorm":

tensorrt_llm/_torch/models/modeling_gpt_oss.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,18 @@ def load_hf_weights(self, weights: Dict):
657657
module_weights = {}
658658
for k, v in self.hf_params_map.items():
659659
name = name.replace(k, v)
660+
661+
# Special case: ConfigurableMoE.backend (TRTLLMGenFusedMoE)
662+
# Currently saved MoE weights don't include 'backend' in their names.
663+
# After MoE refactoring, ConfigurableMoE now has a backend submodule,
664+
# and weights loading is done in the backend, so module name includes '.backend'.
665+
# We need to use parent module name (without .backend) to match saved weight names.
666+
# After MoE refactoring is fully complete, all paths will follow this branch.
667+
names = name.split('.')
668+
if names[-1] == "backend" and isinstance(module, MoE):
669+
# Backend is under experts module (ConfigurableMoE wrapper)
670+
name = '.'.join(names[:-1])
671+
660672
module_weights = filter_weights(name, weights)
661673

662674
if isinstance(module, MoE):

tensorrt_llm/_torch/models/modeling_hunyuan_moe.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
from ..modules.attention import Attention
1616
from ..modules.decoder_layer import DecoderLayer
1717
from ..modules.embedding import Embedding
18-
from ..modules.fused_moe import (CutlassFusedMoE, RenormalizeMoeRoutingMethod,
19-
VanillaMoE, create_moe)
18+
from ..modules.fused_moe import (CutlassFusedMoE, MoE,
19+
RenormalizeMoeRoutingMethod, VanillaMoE,
20+
create_moe)
2021
from ..modules.gated_mlp import GatedMLP
2122
from ..modules.linear import Linear, TensorParallelMode
2223
from ..modules.multi_stream_utils import maybe_execute_in_parallel
@@ -364,6 +365,17 @@ def filter_weights(prefix, weights: Dict):
364365
"lm_head"):
365366
continue
366367
names = name.split('.')
368+
369+
# Special case: ConfigurableMoE.backend (TRTLLMGenFusedMoE)
370+
# Currently saved MoE weights don't include 'backend' in their names.
371+
# After MoE refactoring, ConfigurableMoE now has a backend submodule,
372+
# and weights loading is done in the backend, so module name includes '.backend'.
373+
# We need to use parent module name (without .backend) to match saved weight names.
374+
# After MoE refactoring is fully complete, all paths will follow this branch.
375+
if names[-1] == "backend" and isinstance(module, MoE):
376+
name = '.'.join(names[:-1])
377+
names = name.split('.')
378+
367379
if names[-1] in params_map:
368380
# model.layers.{idx}.mlp.shared_mlp.gate_up_proj or model.layers.{idx}.self_attn.qkv_proj
369381
module_weights = []

tensorrt_llm/_torch/models/modeling_utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -863,6 +863,17 @@ def load_single_module(name, module):
863863
return
864864

865865
names = name.split('.')
866+
867+
# Special case: ConfigurableMoE.backend (TRTLLMGenFusedMoE)
868+
# Currently saved MoE weights don't include 'backend' in their names.
869+
# After MoE refactoring, ConfigurableMoE now has a backend submodule,
870+
# and weights loading is done in the backend, so module name includes '.backend'.
871+
# We need to use parent module name (without .backend) to match saved weight names.
872+
# After MoE refactoring is fully complete, all paths will follow this branch.
873+
if names[-1] == "backend" and isinstance(module, MoE):
874+
name = '.'.join(names[:-1])
875+
names = name.split('.')
876+
866877
# WAR: better solution is that llama has its own load_weights function.
867878
if names[-1] == 'next_layer_layernorm':
868879
return
@@ -956,6 +967,17 @@ def load_single_module(name, module):
956967
return
957968

958969
names = name.split('.')
970+
971+
# Special case: ConfigurableMoE.backend (TRTLLMGenFusedMoE)
972+
# Currently saved MoE weights don't include 'backend' in their names.
973+
# After MoE refactoring, ConfigurableMoE now has a backend submodule,
974+
# and weights loading is done in the backend, so module name includes '.backend'.
975+
# We need to use parent module name (without .backend) to match saved weight names.
976+
# After MoE refactoring is fully complete, all paths will follow this branch.
977+
if names[-1] == "backend" and isinstance(module, MoE):
978+
name = '.'.join(names[:-1])
979+
names = name.split('.')
980+
959981
module_names_breakdown, module_name = names[:-1], names[-1]
960982

961983
if weight_mapper.does_require_special_handling(module_name):

tensorrt_llm/_torch/modules/fused_moe/communication/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
2121
Available Communication Methods:
2222
- AllGatherReduceScatter: Default fallback method, always available
23-
- MnnvlLatency: MNNVL-optimized communication for latency
24-
- MNNVLThroughput: MNNVL-optimized communication for throughput
23+
- NVLinkTwoSided: NVLINK-optimized communication for latency (formerly MNNVLLatency)
24+
- NVLinkOneSided: NVLINK-optimized communication for throughput (formerly MNNVLThroughput)
2525
- DeepEP: Deep Expert Parallelism with support for large batches
2626
- DeepEPLowLatency: Deep Expert Parallelism optimized for low latency
2727
@@ -34,16 +34,16 @@
3434
from .communication_factory import CommunicationFactory
3535
from .deep_ep import DeepEP
3636
from .deep_ep_low_latency import DeepEPLowLatency
37-
from .mnnvl_latency import MnnvlLatency
38-
from .mnnvl_throughput import MNNVLThroughput
37+
from .nvlink_one_sided import NVLinkOneSided
38+
from .nvlink_two_sided import NVLinkTwoSided
3939

4040
__all__ = [
4141
# Base classes and types
4242
"Communication",
4343
# Communication strategies
4444
"AllGatherReduceScatter",
45-
"MnnvlLatency",
46-
"MNNVLThroughput",
45+
"NVLinkTwoSided",
46+
"NVLinkOneSided",
4747
"DeepEP",
4848
"DeepEPLowLatency",
4949
# Factory

tensorrt_llm/_torch/modules/fused_moe/communication/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def __init__(
4949
self.mapping = mapping
5050
self.ep_size = mapping.moe_ep_size
5151
self.ep_rank = mapping.moe_ep_rank
52+
self._is_platform_supported = False
5253

5354
@abstractmethod
5455
def is_workload_feasible(

tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@
3232
from .base import Communication
3333
from .deep_ep import DeepEP
3434
from .deep_ep_low_latency import DeepEPLowLatency
35-
from .mnnvl_latency import MnnvlLatency
36-
from .mnnvl_throughput import MNNVLThroughput
35+
from .nvlink_one_sided import NVLinkOneSided
36+
from .nvlink_two_sided import NVLinkTwoSided
3737

3838

3939
def is_high_throughput() -> bool:
@@ -72,7 +72,7 @@ class CommunicationFactory:
7272
Factory for creating MoE communication methods
7373
7474
Selects the best communication method based on:
75-
- Hardware support (MNNVL, DeepEP)
75+
- Hardware support (NVLINK, DeepEP)
7676
- Configuration settings
7777
- Workload characteristics
7878
"""
@@ -85,16 +85,17 @@ def create_strategy(
8585
top_k: int,
8686
expert_size_per_partition: int,
8787
payload_in_workspace: bool = False,
88-
alltoall_result_do_sum: bool = False,
88+
alltoall_result_do_sum: bool = True,
8989
) -> Optional[Communication]:
9090
"""
9191
Create the best communication method for the given configuration
9292
9393
Selection priority:
94-
1. Force method (if specified via TRTLLM_FORCE_ALLTOALL_METHOD env)
95-
2. MNNVL (if hardware supports)
94+
1. Force method (if specified via TRTLLM_FORCE_COMM_METHOD env)
95+
2. NVLINK (if hardware supports)
9696
- Selects latency or throughput backend based on TRTLLM_MOE_ALLTOALL_BACKEND env
97-
- Default: "mnnvllatency", alternative: "mnnvlthroughput"
97+
- Default: "NVLinkTwoSided", legacy: "mnnvllatency"
98+
- Alternative: "NVLinkOneSided", legacy: "mnnvlthroughput"
9899
3. DeepEP / DeepEPLowLatency (if enabled and hardware supports)
99100
4. AllGather + ReduceScatter (fallback, always works)
100101
@@ -104,8 +105,8 @@ def create_strategy(
104105
num_slots: Total number of expert slots
105106
top_k: Number of experts per token
106107
expert_size_per_partition: Number of experts per partition (required for DeepEP)
107-
payload_in_workspace: If True, final_hidden_states is already in workspace (for MNNVLThroughput)
108-
alltoall_result_do_sum: If True, sum the alltoall results (for MnnvlLatency)
108+
payload_in_workspace: If True, final_hidden_states is already in workspace (for NVLinkOneSided)
109+
alltoall_result_do_sum: If True, sum the alltoall results (for NVLinkTwoSided)
109110
110111
Returns:
111112
The selected communication method, or None if attention does not use DP
@@ -134,19 +135,19 @@ def create_strategy(
134135
return AllGatherReduceScatter(mapping)
135136

136137
# Check if forced method is specified via environment variable
137-
force_method = os.environ.get("TRTLLM_FORCE_ALLTOALL_METHOD")
138+
force_method = os.environ.get("TRTLLM_FORCE_COMM_METHOD")
138139

139140
if force_method is not None:
140141
# Validate platform support for forced method
141142
method_upper = force_method.upper()
142-
if method_upper in ["MNNVLLATENCY", "MNNVLTHROUGHPUT"]:
143-
if not MnnvlLatency.is_platform_supported():
143+
if method_upper in ["NVLINK_TWO_SIDED", "NVLINK_ONE_SIDED"]:
144+
if not NVLinkTwoSided.is_platform_supported():
144145
raise RuntimeError(
145146
f"Forced method '{force_method}' is not supported on this platform. "
146-
"MNNVLLATENCY and MNNVLTHROUGHPUT require compatible hardware."
147+
"NVLINK two-sided and one-sided modes require compatible hardware."
147148
)
148149
elif method_upper in ["DEEPEP", "DEEPEPLOWLATENCY"]:
149-
if not DeepEP.is_platform_supported(mapping):
150+
if not DeepEP.is_platform_supported():
150151
raise RuntimeError(
151152
f"Forced method '{force_method}' is not supported on this platform. "
152153
"DeepEP requires compatible hardware and TRTLLM_CAN_USE_DEEP_EP=1."
@@ -163,19 +164,20 @@ def create_strategy(
163164
alltoall_result_do_sum,
164165
)
165166

166-
# Try MNNVL first (highest priority)
167-
if MnnvlLatency.is_platform_supported():
167+
# Try NVLINK first (highest priority)
168+
if NVLinkTwoSided.is_platform_supported():
169+
# TODO: update when we have a more clear heuristic.
168170
if is_high_throughput():
169-
# Currently, MNNVLThroughput shows better performance at all scenarios
170-
return MNNVLThroughput(
171+
# Currently, NVLinkOneSided shows better performance at all scenarios
172+
return NVLinkOneSided(
171173
mapping,
172-
num_experts,
174+
num_slots,
173175
top_k,
174176
max_num_tokens_per_rank=max_num_tokens,
175177
payload_in_workspace=payload_in_workspace,
176178
)
177179
else:
178-
return MnnvlLatency(
180+
return NVLinkTwoSided(
179181
mapping,
180182
num_experts,
181183
num_slots,
@@ -187,9 +189,7 @@ def create_strategy(
187189
# Try DeepEP
188190
if os.environ.get("TRTLLM_CAN_USE_DEEP_EP", "0") == "1":
189191
if weight_dtype == torch.bfloat16:
190-
if DeepEP.is_platform_supported(mapping) and is_deepep_feasible(
191-
mapping.moe_ep_size
192-
):
192+
if DeepEP.is_platform_supported() and is_deepep_feasible(mapping.moe_ep_size):
193193
return DeepEP(
194194
mapping,
195195
num_slots,
@@ -240,21 +240,21 @@ def _create_forced_method(
240240

241241
method = method.upper()
242242

243-
if method == "MNNVLLATENCY":
244-
return MnnvlLatency(
243+
if method in ["NVLINK_TWO_SIDED"]:
244+
return NVLinkTwoSided(
245245
mapping,
246246
num_experts,
247247
num_slots,
248248
top_k,
249249
use_low_precision_combine,
250250
alltoall_result_do_sum=alltoall_result_do_sum,
251251
)
252-
elif method == "MNNVLTHROUGHPUT":
253-
# MNNVLThroughput requires max_num_tokens_per_rank
252+
elif method in ["NVLINK_ONE_SIDED"]:
253+
# NVLinkOneSided requires max_num_tokens_per_rank
254254
# max_num_tokens is per-rank value (as passed from callers like cutlass)
255-
return MNNVLThroughput(
255+
return NVLinkOneSided(
256256
mapping,
257-
num_experts,
257+
num_slots,
258258
top_k,
259259
max_num_tokens_per_rank=max_num_tokens,
260260
payload_in_workspace=payload_in_workspace,

tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,11 @@ def __init__(
6666
self.deep_ep_buffer = buffer_pool.get_buffer(mapping)
6767
self.deep_ep_buffer.reserve(hidden_size, weight_dtype)
6868

69+
# Initialize platform support check result
70+
self._is_platform_supported = self.is_platform_supported()
71+
6972
@staticmethod
70-
def is_platform_supported(mapping: Mapping) -> bool:
73+
def is_platform_supported() -> bool:
7174
"""
7275
Check if DeepEP is supported on the current platform
7376
"""
@@ -94,7 +97,7 @@ def is_workload_feasible(self, all_rank_num_tokens: List[int], num_chunks: int)
9497
return False
9598
if self.weight_dtype != torch.bfloat16:
9699
return False
97-
return self.is_platform_supported(self.mapping)
100+
return self._is_platform_supported
98101

99102
def dispatch(
100103
self,

tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,11 @@ def __init__(
7979
self.deep_ep_buffer = buffer_pool.get_low_latency_buffer(mapping)
8080
self.deep_ep_buffer.reserve(self.deep_ep_max_num_tokens, hidden_size, num_slots)
8181

82+
# Initialize platform support check result
83+
self._is_platform_supported = self.is_platform_supported()
84+
8285
@staticmethod
83-
def is_platform_supported(mapping: Mapping) -> bool:
86+
def is_platform_supported() -> bool:
8487
"""
8588
Check if DeepEP Low Latency is supported on the current platform
8689
"""
@@ -113,7 +116,7 @@ def is_workload_feasible(self, all_rank_num_tokens: List[int], num_chunks: int)
113116
return False
114117
if self.weight_dtype != torch.bfloat16:
115118
return False
116-
return self.is_platform_supported(self.mapping)
119+
return self._is_platform_supported
117120

118121
def dispatch(
119122
self,

0 commit comments

Comments (0)