Update scale transforms

akaashrp · akaashrp · commit e51af6189f3d · 2025-12-31T03:04:38.000-05:00
diff --git a/python/mlc_llm/conversation_template/__init__.py b/python/mlc_llm/conversation_template/__init__.py
@@ -17,6 +17,7 @@
     hermes,
     llama,
     llava,
+    ministral3,
     mistral,
     nemotron,
     oasst,
diff --git a/python/mlc_llm/model/ministral3/ministral3_loader.py b/python/mlc_llm/model/ministral3/ministral3_loader.py
@@ -121,20 +121,76 @@ def add_weight_and_scale_mapping(
             if weight_scale_mlc_name in named_parameters:
                 weight_scale_hf_names = [f"{name}_scale_inv" for name in weight_hf_names]
                 weight_scale_param = named_parameters[weight_scale_mlc_name]
+                expected_weight_scale_shape = tuple(int(dim) for dim in weight_scale_param.shape)
+
+                def _weight_scale_transform(*arrays, dtype: str, _transform=weight_transform_func):
+                    processed = []  # promote 0-d inputs to shape (1,) before _transform
+                    for arr in arrays:
+                        arr_np = np.asarray(arr)
+                        if arr_np.ndim == 0:
+                            arr_np = arr_np.reshape((1,))
+                        processed.append(arr_np)
+                    result = _transform(*processed, dtype=dtype)  # bound at def time via default
+                    result = np.asarray(result, dtype=dtype)
+                    if result.shape == expected_weight_scale_shape:  # already the target shape
+                        return result
+                    if result.shape == ():  # scalar result: fill every entry with it
+                        return np.full(expected_weight_scale_shape, result.item(), dtype=dtype)
+                    if result.shape == (1,) and expected_weight_scale_shape != (1,):  # one value
+                        return np.broadcast_to(result, expected_weight_scale_shape).astype(dtype)
+                    if (  # 1-D result whose length divides the expected row count
+                        result.ndim == 1
+                        and result.size > 1
+                        and len(expected_weight_scale_shape) >= 2  # NOTE(review): confirm 2-D only
+                        and expected_weight_scale_shape[0] % result.size == 0
+                    ):
+                        rows_per_segment = expected_weight_scale_shape[0] // result.size
+                        tiled = np.repeat(result, rows_per_segment)  # equal-sized row segments
+                        tiled = tiled.reshape(expected_weight_scale_shape[0], 1)  # (rows, 1) column
+                        return np.broadcast_to(tiled, expected_weight_scale_shape).astype(dtype)
+                    raise ValueError(  # none of the recognised shapes matched
+                        f"Unexpected weight scale shape {result.shape} for "
+                        f"{weight_scale_mlc_name}, expected {expected_weight_scale_shape}"
+                    )
                 mapping.add_mapping(
                     weight_scale_mlc_name,
                     weight_scale_hf_names,
-                    functools.partial(weight_transform_func, dtype=weight_scale_param.dtype),
+                    functools.partial(_weight_scale_transform, dtype=weight_scale_param.dtype),
                 )
             activation_scale_mlc_name = f"{weight_mlc_name[: -len('.weight')]}.activation_scale"
             if activation_scale_mlc_name in named_parameters:
                 activation_scale_hf_names = [f"{name[: -len('.weight')]}.activation_scale" for name in weight_hf_names]
                 activation_scale_param = named_parameters[activation_scale_mlc_name]
                 transform = activation_transform_func or weight_transform_func
+                expected_shape = tuple(int(dim) for dim in activation_scale_param.shape)
+
+                def _activation_scale_transform(*arrays, dtype: str, _transform=transform):
+                    result = _transform(*arrays, dtype=dtype)  # NOTE(review): 0-d inputs pass as-is
+                    result = np.asarray(result, dtype=dtype)
+                    if result.shape == expected_shape:  # already the target shape
+                        return result
+                    if result.shape == ():
+                        # HF checkpoint stores a single scale; broadcast across the expected dimension.
+                        return np.full(expected_shape, result.item(), dtype=dtype)
+                    if result.shape == (1,) and expected_shape != (1,):  # one value: broadcast
+                        return np.broadcast_to(result, expected_shape).astype(dtype)
+                    if (  # 1-D result whose length divides expected_shape[0]
+                        result.ndim == 1
+                        and result.size > 1
+                        and len(expected_shape) >= 1  # NOTE(review): reshape needs other dims == 1
+                        and expected_shape[0] % result.size == 0
+                    ):
+                        rows_per_segment = expected_shape[0] // result.size
+                        tiled = np.repeat(result, rows_per_segment)  # equal-sized segments
+                        return tiled.reshape(expected_shape).astype(dtype)
+                    raise ValueError(  # none of the recognised shapes matched
+                        f"Unexpected activation scale shape {result.shape} for "
+                        f"{activation_scale_mlc_name}, expected {expected_shape}"
+                    )
                 mapping.add_mapping(
                     activation_scale_mlc_name,
                     activation_scale_hf_names,
-                    functools.partial(transform, dtype=activation_scale_param.dtype),
+                    functools.partial(_activation_scale_transform, dtype=activation_scale_param.dtype),
                 )
 
     def identity_transform(param: np.ndarray, dtype: str):
diff --git a/python/mlc_llm/model/ministral3/ministral3_model.py b/python/mlc_llm/model/ministral3/ministral3_model.py
@@ -71,14 +71,12 @@ def __post_init__(self):  # pylint: disable=too-many-branches
             if isinstance(quantization_config, dict):
                 activation_scheme = quantization_config.get("activation_scheme", "")
                 quant_method = quantization_config.get("quant_method", "")
-                fmt = quantization_config.get("fmt", "")
                 weight_block_size = quantization_config.get("weight_block_size")
                 modules_to_not_convert = quantization_config.get("modules_to_not_convert", [])
                 if isinstance(modules_to_not_convert, list):
                     self.modules_to_not_convert = tuple(modules_to_not_convert)
                 if (
                     quant_method == "fp8"
-                    and fmt == "e4m3"
                     and activation_scheme == "static"
                     and weight_block_size is not None
                 ):
@@ -95,16 +93,17 @@ def __post_init__(self):  # pylint: disable=too-many-branches
                 else:
                     self.weight_block_size = [128, 128]
                     logger.info(
-                        "Setting default weight_block_size since quantization_config does not provide "
-                        "FP8 block-scale details required by MLC (activation_scheme=%s, quant_method=%s, "
-                        "fmt=%s, weight_block_size=%s)",
+                        "Setting default weight_block_size=%s since quantization_config does not provide "
+                        "FP8 block-scale details required by MLC (activation_scheme=%s, quant_method=%s)",
+                        self.weight_block_size,
                         activation_scheme,
                         quant_method,
-                        fmt,
-                        weight_block_size,
                     )
             else:
-                logger.info("Ignoring non-dict quantization_config: %s", quantization_config)
+                raise ValueError(
+                    "Invalid Ministral 3 model quantization config: unrecognized quantization config: "
+                    f"{quantization_config}"
+                )
         
         if self.position_embedding_base == 0:
             if self.rope_parameters is not None and "rope_theta" in self.rope_parameters:
diff --git a/python/mlc_llm/model/model.py b/python/mlc_llm/model/model.py
@@ -145,7 +145,6 @@ class Model:
         source={
             "huggingface-torch": ministral3_loader.huggingface,
             "huggingface-safetensor": ministral3_loader.huggingface,
-            "awq": ministral3_loader.awq,
         },
         quantize={
             "group-quant": ministral3_quantization.group_quant,