|
41 | 41 | ExpertConfig, |
42 | 42 | LinearConfig, |
43 | 43 | ModelConfig, |
44 | | - MOEConfig, |
45 | 44 | RelativeAttentionTableConfig, |
46 | 45 | ) |
47 | 46 | from .model_config_utils import pad_weights |
@@ -99,17 +98,6 @@ def _split_model_config_for_tp(merged_config, split_factor): |
99 | 98 | for i, config in enumerate(configs): |
100 | 99 | config.weight = weights[i] |
101 | 100 |
|
102 | | - elif isinstance(merged_config, MOEConfig): |
103 | | - split_expert_configs = _split_model_config_for_tp( |
104 | | - merged_config.experts, |
105 | | - split_factor, |
106 | | - ) |
107 | | - # TP for rounter of MoE is skipped for better performance |
108 | | - # See https://github.com/NVIDIA/TensorRT-LLM/pull/1091 for details |
109 | | - for i in range(split_factor): |
110 | | - configs[i].experts = split_expert_configs[i] |
111 | | - configs[i].router = merged_config.router |
112 | | - |
113 | 101 | elif isinstance(merged_config, ExpertConfig): |
114 | 102 | assert merged_config.proj.linear_type != LINEAR_COLUMN # row |
115 | 103 | assert merged_config.fc.linear_type == LINEAR_COLUMN # column |
@@ -199,6 +187,10 @@ def _split_model_config_for_tp(merged_config, split_factor): |
199 | 187 | "Do not support group linear TP merge or split" |
200 | 188 | ) |
201 | 189 |
|
| 190 | + # Do not do anything if we don't need to process TP. |
| 191 | + if not merged_config.tp: |
| 192 | + return configs |
| 193 | + |
202 | 194 | split_axis = 0 if merged_config.linear_type == LINEAR_COLUMN else 1 |
203 | 195 | if merged_config.linear_type == LINEAR_COLUMN: |
204 | 196 | merged_config.weight = pad_weights(merged_config.weight, split_factor) |
@@ -342,6 +334,10 @@ def _merge_model_configs_to_first_tp(config, ranks: list[int], group=None): |
342 | 334 |
|
343 | 335 | assert config.linear_type != LINEAR_GROUP, "Do not support group linear TP merge or split" |
344 | 336 |
|
| 337 | + # No merge is needed if tp is disabled. |
| 338 | + if not config.tp: |
| 339 | + return |
| 340 | + |
345 | 341 | # Handling constants |
346 | 342 | for field_name in [ |
347 | 343 | "activation_scaling_factor", |
@@ -758,41 +754,48 @@ def check_weight_shape_valid(config, inference_tensor_parallel=1, training_tenso |
758 | 754 | This function is recursive. |
759 | 755 | """ |
760 | 756 |
|
761 | | - def _check_merged_weight(merged_k): |
762 | | - assert merged_k % inference_tensor_parallel == 0, ( |
763 | | - f"Weights cannot be split into {inference_tensor_parallel} ranks." |
764 | | - ) |
| 757 | + def _check_merged_weight(merged_k, tp): |
| 758 | + assert merged_k % tp == 0, f"Weights with shape {merged_k} cannot be split into {tp} ranks." |
765 | 759 |
|
766 | | - def _check_merged_weight_scaling_factor(merged_k, awq_block_size): |
767 | | - if awq_block_size > 0 and (merged_k // inference_tensor_parallel) % awq_block_size != 0: |
| 760 | + def _check_merged_weight_scaling_factor(merged_k, tp, awq_block_size): |
| 761 | + if awq_block_size > 0 and (merged_k // tp) % awq_block_size != 0: |
768 | 762 | raise NotImplementedError( |
769 | | - "Weight shape is not divisible for block size for block quantization." |
| 763 | + f"Weight shape {merged_k} of each TP tp={tp} " |
| 764 | + f"is not divisible for block size {awq_block_size} for block quantization." |
770 | 765 | ) |
771 | 766 |
|
772 | | - def _check_merged_channel_is_valid(merged_k, awq_block_size): |
773 | | - _check_merged_weight(merged_k=merged_k) |
774 | | - _check_merged_weight_scaling_factor(merged_k=merged_k, awq_block_size=awq_block_size) |
| 767 | + def _check_merged_channel_is_valid(merged_k, tp, awq_block_size): |
| 768 | + _check_merged_weight(merged_k=merged_k, tp=tp) |
| 769 | + _check_merged_weight_scaling_factor(merged_k=merged_k, tp=tp, awq_block_size=awq_block_size) |
775 | 770 |
|
776 | 771 | if isinstance(config, LinearConfig): |
777 | 772 | # check weight shape |
| 773 | + if not config.tp: |
| 774 | + inference_tensor_parallel = 1 |
778 | 775 | if config.linear_type == LINEAR_COLUMN: |
779 | 776 | _, k = config.weight.shape |
780 | 777 | merged_k = k * training_tensor_parallel |
781 | | - _check_merged_channel_is_valid(merged_k, config.awq_block_size) |
| 778 | + _check_merged_channel_is_valid( |
| 779 | + merged_k, tp=inference_tensor_parallel, awq_block_size=config.awq_block_size |
| 780 | + ) |
782 | 781 | elif config.linear_type == LINEAR_ROW: |
783 | 782 | k, m = config.weight.shape |
784 | 783 | merged_k = k * training_tensor_parallel |
785 | 784 | merged_m = m * training_tensor_parallel |
786 | 785 | # For int4_awq, weight scaling factors will be split as (k, (merged_m // TP) // block_size) |
787 | | - _check_merged_weight(merged_k=merged_k) |
788 | | - _check_merged_weight_scaling_factor(merged_m, config.awq_block_size) |
| 786 | + _check_merged_weight(merged_k=merged_k, tp=inference_tensor_parallel) |
| 787 | + _check_merged_weight_scaling_factor( |
| 788 | + merged_m, tp=inference_tensor_parallel, awq_block_size=config.awq_block_size |
| 789 | + ) |
789 | 790 |
|
790 | 791 | return |
791 | 792 |
|
792 | 793 | if isinstance(config, ExpertConfig): |
793 | 794 | _, _, k = config.fc.weight.shape |
794 | 795 | merged_k = k * training_tensor_parallel |
795 | | - _check_merged_channel_is_valid(merged_k, config.fc.awq_block_size) |
| 796 | + _check_merged_channel_is_valid( |
| 797 | + merged_k, tp=inference_tensor_parallel, awq_block_size=config.fc.awq_block_size |
| 798 | + ) |
796 | 799 | return |
797 | 800 |
|
798 | 801 | if is_dataclass(config): |
|
0 commit comments