
Commit cdc8996

Qwen3 TRT-LLM/HF export (#188)
Co-authored-by: Michael Feil <[email protected]>
1 parent 7af33d2 commit cdc8996

2 files changed (+113 -14 lines changed)


modelopt/torch/export/layer_utils.py

Lines changed: 66 additions & 13 deletions
@@ -324,6 +324,7 @@ def is_moe(module: nn.Module) -> bool:
         "PhimoeSparseMoeBlock".lower(),
         "DeepseekMoE".lower(),
         "Qwen2MoeSparseMoeBlock".lower(),
+        "Qwen3MoeSparseMoeBlock".lower(),
     ]
@@ -969,26 +970,75 @@ def get_stacked_scaling_factors(experts, get_function, module_name):
     return config
 
 
-@contextmanager
-def set_zero_amax_for_uncalibrated_experts(experts: nn.Module):
-    """For experts that does not have valid amax value of input quantizer, we set them to 0."""
+def get_expert_linear_names(module: nn.Module) -> list[str]:
+    """Get the list of linear names for the experts."""
+    if type(module).__name__.lower() in [
+        "Qwen2MoeSparseMoeBlock".lower(),
+        "Qwen3MoeSparseMoeBlock".lower(),
+        "DeepseekMoE".lower(),
+    ]:
+        return ["gate_proj", "down_proj", "up_proj"]
+    elif type(module).__name__.lower() in "MixtralMoeSparseMoeBlock".lower():
+        return ["linear_fc1", "linear_fc2"]
+    elif type(module).__name__.lower() in "DBRXMoeSparseMoeBlock".lower():
+        return ["w1_linear", "w2_linear", "v1_linear"]
+    else:
+        # assuing w1, w2, w3 by default
+        return ["w1", "w2", "w3"]
+
+
+def set_amax_for_uncalibrated_experts(experts: nn.Module, set_amax_value: float | None = None):
+    """Set amax of uncalibrated experts to a given value or the max of existing amax value from other experts.
+
+    Args:
+        experts: a list of experts
+        set_amax_value: set amax value to the given value.
+            If None, set amax value to the max of existing amax value from other experts.
+
+    Returns:
+        uncalibrated_experts: a list of uncalibrated experts
+    """
     uncalibrated_experts = []
+    # get the max amax value from all experts
+    if set_amax_value is None:
+        amax_values = [
+            module.input_quantizer.amax
+            for module in experts
+            if (
+                hasattr(module, "input_quantizer")
+                and module.input_quantizer is not None
+                and module.input_quantizer.is_enabled
+            )
+            and module.input_quantizer.amax is not None
+        ]
+        if len(amax_values) == 0:
+            return uncalibrated_experts
+        set_amax_value = torch.max(torch.stack(amax_values))
+
     for module in experts:
         if (
             hasattr(module, "input_quantizer")
             and module.input_quantizer is not None
             and module.input_quantizer.is_enabled
         ) and module.input_quantizer.amax is None:
             warn(
-                f"Missing amax value for {module} input_quantizer. Setting it to 0 for checkpoint export. "
+                f"Missing amax value for {module} input_quantizer. Setting it to {set_amax_value} for export. "
                 f"This typically occurs in MoE models when certain experts are not activated during calibration. "
                 f"Consider increasing your calibration dataset size to ensure all experts are exercised."
             )
             # Use float32 dtype explicitly to ensure we create a floating point tensor
             module.input_quantizer.amax = torch.tensor(
-                0.0, dtype=torch.float32, device=module.weight_quantizer.amax.device
+                set_amax_value, dtype=torch.float32, device=module.weight_quantizer.amax.device
             )
             uncalibrated_experts.append(module)
+    return uncalibrated_experts
+
+
+@contextmanager
+def set_amax_for_uncalibrated_experts_context(
+    experts: nn.Module, set_amax_value: float | None = None
+):
+    """Set amax for uncalibrated experts in a context manager."""
+    uncalibrated_experts = set_amax_for_uncalibrated_experts(experts, set_amax_value)
     yield
     if uncalibrated_experts:
         for module in uncalibrated_experts:
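
A usage sketch (not part of the commit) of how the new set_amax_for_uncalibrated_experts helper behaves on a flat list of expert projections. The import path is taken from the file path in this diff; the _FakeQuantizer and _FakeExpertLinear stand-ins are hypothetical and only expose the attributes the helper reads (input_quantizer.amax, input_quantizer.is_enabled, weight_quantizer.amax.device). In the real export path these are modelopt quantized linear modules.

# Minimal sketch, assuming modelopt is installed; stand-in classes are hypothetical.
import torch
import torch.nn as nn

from modelopt.torch.export.layer_utils import set_amax_for_uncalibrated_experts


class _FakeQuantizer:
    # Just enough surface for the helper: amax + is_enabled.
    def __init__(self, amax):
        self.amax = None if amax is None else torch.tensor(amax)
        self.is_enabled = True


class _FakeExpertLinear(nn.Module):
    # Expert projection with input/weight quantizers attached, as the helper expects.
    def __init__(self, input_amax):
        super().__init__()
        self.input_quantizer = _FakeQuantizer(input_amax)
        self.weight_quantizer = _FakeQuantizer(1.0)  # device of the filled-in amax is read from here


# Experts 0 and 1 were calibrated; expert 2 never received tokens, so its amax is None.
experts = [_FakeExpertLinear(0.5), _FakeExpertLinear(2.0), _FakeExpertLinear(None)]

# With set_amax_value=None the helper warns about the uncalibrated expert and fills
# the gap with the max of the existing amax values (here 2.0).
set_amax_for_uncalibrated_experts(experts)
print([e.input_quantizer.amax for e in experts])  # last entry is now tensor(2.)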
@@ -1022,12 +1072,13 @@ def build_stacked_experts(
     )
 
     # Set amax to 0 for uncalibrated experts
-    with set_zero_amax_for_uncalibrated_experts(
+    with set_amax_for_uncalibrated_experts_context(
         [
             expert_getter(experts, i, module_name)
             for module_name in linear_names
             for i in range(num_experts)
-        ]
+        ],
+        0,  # set amax to 0 for uncalibrated experts as we will calculate max across all experts later
     ):
         # Pre-fuse W1 and W3
         if len(linear_names) == 3:
@@ -1121,12 +1172,14 @@ def build_moe_config(module: nn.Module, decoder_type) -> MOEConfig:
         )
     elif decoder_type == "qwen":
         config.router = build_linear_config(module.gate, LINEAR_ROW)
-        preprocess_linear_fusion([module.shared_expert.gate_proj, module.shared_expert.up_proj])
-        config.shared_expert = build_mlp_config(
-            module.shared_expert, decoder_type, merge_gate_fc=True
-        )
-        config.shared_expert_gate = build_linear_config(module.shared_expert_gate, LINEAR_ROW)
-        config.shared_expert_gate.tp = False
+        # Qwen3 doesn't have shared expert
+        if hasattr(module, "shared_expert"):
+            preprocess_linear_fusion([module.shared_expert.gate_proj, module.shared_expert.up_proj])
+            config.shared_expert = build_mlp_config(
+                module.shared_expert, decoder_type, merge_gate_fc=True
+            )
+            config.shared_expert_gate = build_linear_config(module.shared_expert_gate, LINEAR_ROW)
+            config.shared_expert_gate.tp = False
     else:
         raise NotImplementedError(f"{decoder_type} not supported")
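For context (not from the commit): the hasattr() guard matters because Qwen2-style MoE blocks carry a shared_expert and shared_expert_gate while Qwen3-style blocks do not. The toy modules below are illustrative stand-ins, not the Hugging Face classes; the attribute layout is an assumption based on the comment in the diff.

# Illustrative stand-ins only.
from torch import nn


class Qwen2StyleMoeBlock(nn.Module):
    def __init__(self, hidden=8, num_experts=4):
        super().__init__()
        self.gate = nn.Linear(hidden, num_experts, bias=False)  # router
        self.experts = nn.ModuleList(nn.Linear(hidden, hidden) for _ in range(num_experts))
        self.shared_expert = nn.Linear(hidden, hidden)           # always-on expert
        self.shared_expert_gate = nn.Linear(hidden, 1, bias=False)


class Qwen3StyleMoeBlock(nn.Module):
    def __init__(self, hidden=8, num_experts=4):
        super().__init__()
        self.gate = nn.Linear(hidden, num_experts, bias=False)  # router only
        self.experts = nn.ModuleList(nn.Linear(hidden, hidden) for _ in range(num_experts))


for block in (Qwen2StyleMoeBlock(), Qwen3StyleMoeBlock()):
    # build_moe_config() now only builds the shared-expert configs when the
    # attribute exists, so Qwen3-style blocks no longer hit an AttributeError here.
    print(type(block).__name__, "has shared expert:", hasattr(block, "shared_expert"))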
modelopt/torch/export/unified_export_hf.py

Lines changed: 47 additions & 1 deletion
@@ -15,6 +15,7 @@
 
 """Code that export quantized Hugging Face models for deployment."""
 
+import collections.abc
 import json
 import tempfile
 import warnings
@@ -29,7 +30,14 @@
 from modelopt.torch.quantization.nn import SequentialQuantizer
 
 from .convert_hf_config import convert_hf_quant_config_format
-from .layer_utils import get_experts_list, is_layernorm, is_moe, is_quantlinear
+from .layer_utils import (
+    get_expert_linear_names,
+    get_experts_list,
+    is_layernorm,
+    is_moe,
+    is_quantlinear,
+    set_amax_for_uncalibrated_experts,
+)
 from .model_config import (
     KV_CACHE_FP8,
     KV_CACHE_NVFP4,
@@ -184,6 +192,44 @@ def _export_hf_checkpoint(
     root = getattr(root, "layers", root)
     layer_pool = {f"model.layers.{name}": sub_module for name, sub_module in root.named_modules()}
 
+    # Handle input quantizers of experts that are not calibrated
+    for name, sub_module in model.named_modules():
+        if is_moe(sub_module) and hasattr(sub_module, "experts"):
+            expert_linear_names = get_expert_linear_names(sub_module)
+            for linear_name in expert_linear_names:
+                # Handle DBRX experts specifically
+                if "QuantDbrxExperts" in type(sub_module.experts).__name__:
+                    # For DBRX, experts are in sub_module.experts.mlp and linear layers are ModuleLists
+                    experts_mlp = sub_module.experts.mlp
+                    if hasattr(experts_mlp, linear_name):
+                        linear_modulelist = getattr(experts_mlp, linear_name)
+                        if hasattr(linear_modulelist, "__iter__"):
+                            set_amax_for_uncalibrated_experts(list(linear_modulelist))
+                elif isinstance(sub_module.experts, collections.abc.Iterable):
+                    # For other MoE models (like Mixtral) with iterable experts
+                    try:
+                        set_amax_for_uncalibrated_experts(
+                            [getattr(expert, linear_name) for expert in sub_module.experts]
+                        )
+                    except AttributeError as e:
+                        # Provide more helpful debugging information
+                        expert_types = [type(expert).__name__ for expert in sub_module.experts]
+                        raise AttributeError(
+                            f"Failed to access attribute '{linear_name}' on experts. "
+                            f"MoE module type: {type(sub_module).__name__}, "
+                            f"Expert types: {expert_types}, "
+                            f"Expected linear names: {expert_linear_names}. "
+                            f"This suggests the get_expert_linear_names function may need "
+                            f"to be updated for this model architecture. "
+                            f"Original error: {e}"
+                        ) from e
+                else:
+                    # Unsupported MoE model structure
+                    raise NotImplementedError(
+                        f"MoE model with experts type '{type(sub_module.experts).__name__}' is not supported in export."
+                        f"Please file an issue or add support for this model architecture."
+                    )
+
     # NOTE: Speculative decoding models have extra modules that may be quantized
     # Need to add these modules to the layer_pool
     for key in SPECULATIVE_DECODING_MODULE_NAMES: