
Commit 0f7b763

Add modules_to_not_convert and fix activation scale name
1 parent ce48d38 commit 0f7b763

File tree

4 files changed
+33 -17 lines changed

python/mlc_llm/model/ministral3/ministral3_loader.py

Lines changed: 4 additions & 7 deletions
@@ -67,7 +67,7 @@ def huggingface(model_config: Ministral3Config, quantization: Quantization) -> E
         model = quantization.quantize_model(model, QuantizeMapping({}, {}), "")
         if model_config.weight_block_size is None:
             raise ValueError(
-                "The input DeepSeek model is not fp8 block quantized. "
+                "The input Ministral 3 model is not fp8 block quantized. "
                 "Thus BlockScaleQuantize is not supported."
             )

@@ -98,7 +98,7 @@ def hf(name: str) -> str:
             and model_config.weight_block_size is not None
         ):
             raise ValueError(
-                "The input DeepSeek model is fp8 block quantized. "
+                "The input Ministral 3 model is fp8 block quantized. "
                 "Please use BlockScaleQuantize for the model."
             )

@@ -126,9 +126,9 @@ def add_weight_and_scale_mapping(
                 weight_scale_hf_names,
                 functools.partial(weight_transform_func, dtype=weight_scale_param.dtype),
             )
-            activation_scale_mlc_name = f"{weight_mlc_name}_activation_scale"
+            activation_scale_mlc_name = f"{weight_mlc_name[: -len('.weight')]}.activation_scale"
             if activation_scale_mlc_name in named_parameters:
-                activation_scale_hf_names = [f"{name}_activation_scale" for name in weight_hf_names]
+                activation_scale_hf_names = [f"{name[: -len('.weight')]}.activation_scale" for name in weight_hf_names]
                 activation_scale_param = named_parameters[activation_scale_mlc_name]
                 transform = activation_transform_func or weight_transform_func
                 mapping.add_mapping(
@@ -140,9 +140,6 @@ def add_weight_and_scale_mapping(
     def identity_transform(param: np.ndarray, dtype: str):
         return param.astype(dtype)

-    def concat_along_dim0(*arrays: np.ndarray, dtype: str):
-        return np.concatenate(arrays, axis=0).astype(dtype)
-
     def make_shared_activation_transform(target_name: str):
         def func(first: np.ndarray, *rest: np.ndarray, dtype: str):
             for idx, arr in enumerate(rest, start=1):
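
What the activation-scale rename amounts to, as a minimal standalone sketch (the parameter name below is made up for illustration; only the string transform is taken from the diff):

```python
# Minimal sketch of the renaming fix. The parameter name is hypothetical;
# only the slicing/formatting mirrors the loader change above.
weight_mlc_name = "model.layers.0.mlp.down_proj.weight"

# Old scheme: append a suffix to the full weight name.
old_name = f"{weight_mlc_name}_activation_scale"
# -> "model.layers.0.mlp.down_proj.weight_activation_scale"

# New scheme: strip the trailing ".weight" and attach ".activation_scale",
# which is the form the loader now looks up in named_parameters.
new_name = f"{weight_mlc_name[: -len('.weight')]}.activation_scale"
# -> "model.layers.0.mlp.down_proj.activation_scale"

print(old_name)
print(new_name)
```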

python/mlc_llm/model/ministral3/ministral3_model.py

Lines changed: 19 additions & 0 deletions
@@ -46,6 +46,7 @@ class Ministral3Config(ConfigBase):  # pylint: disable=too-many-instance-attribu
     tie_word_embeddings: bool = False
     weight_block_size: Optional[Tuple[int, int]] = None
     kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
+    modules_to_not_convert: Tuple[str, ...] = dataclasses.field(default_factory=tuple)

     @classmethod
     def from_dict(  # type: ignore[override]
@@ -72,6 +73,9 @@ def __post_init__(self):  # pylint: disable=too-many-branches
         quant_method = quantization_config.get("quant_method", "")
         fmt = quantization_config.get("fmt", "")
         weight_block_size = quantization_config.get("weight_block_size")
+        modules_to_not_convert = quantization_config.get("modules_to_not_convert", [])
+        if isinstance(modules_to_not_convert, list):
+            self.modules_to_not_convert = tuple(modules_to_not_convert)
         if (
             quant_method == "fp8"
             and fmt == "e4m3"
@@ -317,6 +321,7 @@ def __init__(self, config: Ministral3Config):
         self.tie_word_embeddings = config.tie_word_embeddings
         if not config.tie_word_embeddings:
             self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # "vocab_size"
+        self._mark_modules_no_quant(config.modules_to_not_convert)
         self.num_hidden_layers = config.num_hidden_layers
         self.num_attention_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
@@ -330,6 +335,20 @@ def __init__(self, config: Ministral3Config):
         self.dtype = config.dtype
         self.weight_block_size = config.weight_block_size

+    def _mark_modules_no_quant(self, modules: Tuple[str, ...]):
+        for path in modules:
+            if not path:
+                continue
+            parts = path.split(".")
+            target = self
+            for part in parts:
+                if not hasattr(target, part):
+                    target = None
+                    break
+                target = getattr(target, part)
+            if target is not None:
+                setattr(target, "no_quantization", True)
+
     def to(self, dtype: Optional[str] = None):
         super().to(dtype=dtype)
         if dtype is not None:
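
For clarity, here is a standalone sketch of the dotted-path walk that `_mark_modules_no_quant` performs, using plain placeholder objects instead of the real nn.Module tree (the attribute names are illustrative):

```python
# Standalone sketch of the dotted-path resolution in _mark_modules_no_quant.
# Dummy stands in for the real module tree; the attribute names are illustrative.
class Dummy:
    pass

model = Dummy()
model.lm_head = Dummy()
model.model = Dummy()
model.model.embed_tokens = Dummy()


def mark_modules_no_quant(root, modules):
    """Walk each dotted attribute path and tag the leaf with no_quantization=True."""
    for path in modules:
        if not path:
            continue
        target = root
        for part in path.split("."):
            if not hasattr(target, part):
                target = None  # unknown path: silently skipped, as in the diff
                break
            target = getattr(target, part)
        if target is not None:
            target.no_quantization = True


mark_modules_no_quant(model, ("lm_head", "model.embed_tokens", "model.does_not_exist"))
print(getattr(model.lm_head, "no_quantization", False))             # True
print(getattr(model.model.embed_tokens, "no_quantization", False))  # True
```

The `no_quantization` flag set here is presumably what the quantizer consults so that modules listed in `modules_to_not_convert` (e.g. `lm_head` in HF fp8 configs) stay in the model dtype.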

python/mlc_llm/quantization/block_scale_quantization.py

Lines changed: 8 additions & 8 deletions
@@ -159,7 +159,7 @@ def visit_module(self, name: str, node: nn.Module) -> Any:
             and not is_moe_gate(name, node)
         ):
             if self.config.use_activation_scale:
-                return BlockScaleQuantizeLinearMinistral3.from_linear(
+                return BlockScaleQuantizeLinearStaticActivation.from_linear(
                     node, self.config, weight_block_size
                 )
             return BlockScaleQuantizeLinear.from_linear(
@@ -329,8 +329,8 @@ def to(self, dtype: Optional[str] = None) -> None:
         self.dtype = dtype  # pylint: disable=attribute-defined-outside-init


-class BlockScaleQuantizeLinearMinistral3(BlockScaleQuantizeLinear):
-    """Block-scale quantization for Ministral3 static activation FP8."""
+class BlockScaleQuantizeLinearStaticActivation(BlockScaleQuantizeLinear):
+    """Block-scale quantization for static activation FP8."""

     def __init__(  # pylint: disable=too-many-arguments
         self,
@@ -357,9 +357,9 @@ def __init__(  # pylint: disable=too-many-arguments
     @staticmethod
     def from_linear(
         src: nn.Linear, config: BlockScaleQuantize, weight_block_size: Optional[Tuple[int, int]]
-    ) -> "BlockScaleQuantizeLinearMinistral3":
+    ) -> "BlockScaleQuantizeLinearStaticActivation":
         """
-        Convert a non-quantized nn.Linear to a block-scale quantized BlockScaleQuantizeLinearMinistral3.
+        Convert a non-quantized nn.Linear to a block-scale quantized BlockScaleQuantizeLinearStaticActivation.

         Parameters
         ----------
@@ -374,12 +374,12 @@ def from_linear(

         Returns
         -------
-        ret : BlockScaleQuantizeLinearMinistral3
-            The block-scale quantized BlockScaleQuantizeLinearMinistral3
+        ret : BlockScaleQuantizeLinearStaticActivation
+            The block-scale quantized BlockScaleQuantizeLinearStaticActivation
         """
         assert weight_block_size is not None
         out_features, in_features = src.weight.shape
-        quantized_linear = BlockScaleQuantizeLinearMinistral3(
+        quantized_linear = BlockScaleQuantizeLinearStaticActivation(
             in_features=in_features,
             out_features=out_features,
             weight_dtype=config.weight_dtype,
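
The rename does not change behavior; the mutator still picks the static-activation subclass only when the config asks for activation scales. A reduced sketch of that dispatch, using stand-in classes rather than the real MLC modules (the default block size is illustrative):

```python
# Reduced sketch of the visit_module dispatch after the rename.
# FakeConfig and both classes here are stand-ins, not the real MLC types.
from dataclasses import dataclass


@dataclass
class FakeConfig:
    use_activation_scale: bool


class BlockScaleQuantizeLinear:
    @classmethod
    def from_linear(cls, node, config, weight_block_size):
        # The real method builds a block-scale quantized module from the nn.Linear.
        return cls.__name__


class BlockScaleQuantizeLinearStaticActivation(BlockScaleQuantizeLinear):
    """Variant that additionally carries a static activation scale."""


def convert(node, config, weight_block_size=(128, 128)):
    if config.use_activation_scale:
        return BlockScaleQuantizeLinearStaticActivation.from_linear(node, config, weight_block_size)
    return BlockScaleQuantizeLinear.from_linear(node, config, weight_block_size)


print(convert(None, FakeConfig(use_activation_scale=True)))   # BlockScaleQuantizeLinearStaticActivation
print(convert(None, FakeConfig(use_activation_scale=False)))  # BlockScaleQuantizeLinear
```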

python/mlc_llm/quantization/quantization.py

Lines changed: 2 additions & 2 deletions
@@ -191,8 +191,8 @@ def quantize_weight(self, weight: tvm.runtime.Tensor) -> List[tvm.runtime.Tensor
         weight_dtype="float8_e4m3fn",
         model_dtype="bfloat16",
     ),
-    "fp8_e4m3fn_bf16_block_scale_ministral3": BlockScaleQuantize(
-        name="fp8_e4m3fn_bf16_block_scale_ministral3",
+    "fp8_e4m3fn_bf16_block_scale_static_activation": BlockScaleQuantize(
+        name="fp8_e4m3fn_bf16_block_scale_static_activation",
         kind="block-scale-quant",
         weight_dtype="float8_e4m3fn",
         model_dtype="bfloat16",
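
A hedged usage sketch of the renamed preset, assuming the registry dict defined in this file is exposed as `QUANTIZATION`; the field names come straight from the entry above:

```python
# Hedged sketch: look up the renamed preset. Assumes the registry dict in
# python/mlc_llm/quantization/quantization.py is named QUANTIZATION.
from mlc_llm.quantization.quantization import QUANTIZATION

quant = QUANTIZATION["fp8_e4m3fn_bf16_block_scale_static_activation"]
print(quant.name)          # fp8_e4m3fn_bf16_block_scale_static_activation
print(quant.kind)          # block-scale-quant
print(quant.weight_dtype)  # float8_e4m3fn
print(quant.model_dtype)   # bfloat16
```

The same string is what would be passed as the quantization option when converting weights or compiling the model.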
