Commit e575558

support for gguf mixed q2_k_s (#1059)
* support for gguf mixed q2_k_s

Signed-off-by: n1ck-guo <[email protected]>
1 parent e1ec855 · commit e575558

3 files changed: +68 −25 lines

auto_round/compressors/base.py

Lines changed: 38 additions & 25 deletions
@@ -55,7 +55,13 @@
 from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG, ModelType
 from auto_round.logger import logger
-from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme
+from auto_round.schemes import (
+    SPECIAL_SCHEMES,
+    QuantizationScheme,
+    _handle_special_schemes,
+    get_gguf_scheme,
+    preset_name_to_scheme,
+)
 from auto_round.sign_sgd import SignSGD
 from auto_round.special_model_handler import _handle_moe_model
 from auto_round.utils import (
@@ -214,6 +220,33 @@ def __init__(
         ... }
         """
 
+        # Model related
+        model_dtype = kwargs.pop("model_dtype", None)
+        self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False
+        self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False
+        self.quantized = False
+        if isinstance(model, str):
+            model, tokenizer = llm_load_model(
+                model,
+                platform=platform,
+                device="cpu",  # always load on CPU first
+                model_dtype=model_dtype,
+            )
+        elif tokenizer is None and not self.diffusion and iters > 0:
+            raise ValueError("A tokenizer must be set for non-str model input")
+        if unsupported_meta_device(model):
+            raise RuntimeError(
+                "AutoRound does not support parameters on meta device. "
+                "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU."
+            )
+        check_and_mark_fp8_model(model)
+        self.model = model.eval()
+        self.tokenizer = tokenizer
+        self.shared_cache_keys = get_shared_keys(self.model)
+
+        self.layer_config = layer_config
+
+        # should be set after loading the model and setting layer_config, since some special schemes need them
         self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs)
 
         gguf_scheme_name = get_gguf_scheme(self.scheme)
@@ -244,11 +277,8 @@ def __init__(
             platform = "model_scope"
         self.platform = platform
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
-        self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False
-        self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False
 
         self.fp_layers = kwargs.pop("fp_layers", "")
-        self.layer_config = layer_config
         self.supported_types = SUPPORTED_LAYER_TYPES
         self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
         self.scale_dtype = convert_dtype_str2torch(scale_dtype)
@@ -270,27 +300,6 @@ def __init__(
         else:
             torch.use_deterministic_algorithms(True, warn_only=True)
 
-        # Model related
-        self.quantized = False
-        if isinstance(model, str):
-            model, tokenizer = llm_load_model(
-                model,
-                platform=platform,
-                device="cpu",  # always load on CPU first
-                model_dtype=model_dtype,
-            )
-        elif tokenizer is None and not self.diffusion and iters > 0:
-            raise ValueError("A tokenizer must be set for non-str model input")
-        if unsupported_meta_device(model):
-            raise RuntimeError(
-                "AutoRound does not support parameters on meta device. "
-                "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU."
-            )
-        check_and_mark_fp8_model(model)
-        self.model = model.eval()
-        self.tokenizer = tokenizer
-        self.shared_cache_keys = get_shared_keys(self.model)
-
         self.to_quant_block_names = to_quant_block_names
         if not hasattr(self, "quant_block_list"):
             all_blocks = get_block_names(model)
@@ -524,6 +533,8 @@ def _parse_and_set(scheme, kwargs):
                 scheme = scheme.strip("'\" ")
                 res = scheme
                 scheme = scheme.upper()
+                if scheme in SPECIAL_SCHEMES:
+                    self.layer_config = _handle_special_schemes(scheme, self.layer_config, self.model)
                 scheme = asdict(preset_name_to_scheme(scheme))
             scheme_keys = [f.name for f in fields(QuantizationScheme)]
             for key in scheme_keys:
@@ -776,6 +787,8 @@ def remove_duplicates(lst):
 
         if gguf_format_name:
             for i in range(len(formats)):
+                if gguf_format_name.lower().endswith("mixed"):
+                    gguf_format_name = gguf_format_name.lower().replace("_mixed", "_s")
                 if formats[i] != "fake" and formats[i] != gguf_format_name.lower():
                     logger.warning(
                         f"reset format {formats[i]} to {gguf_format_name.lower()} "

auto_round/schemes.py

Lines changed: 21 additions & 0 deletions
@@ -16,6 +16,8 @@
 from dataclasses import dataclass, fields
 from typing import Optional, Union
 
+import torch
+
 __all__ = ["QuantizationScheme", "get_gguf_scheme", "preset_name_to_scheme"]
 
 
@@ -265,6 +267,25 @@ def is_preset_scheme(name: str) -> bool:
     value.pop("lm_head", None)
     PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value)
 
+SPECIAL_SCHEMES = {"GGUF:Q2_K_MIXED": PRESET_SCHEMES["GGUF:Q2_K_S"]}
+PRESET_SCHEMES.update(SPECIAL_SCHEMES)
+
+
+def _handle_special_schemes(scheme_name: str, layer_config: dict, model: torch.nn.Module) -> dict:
+    """Handle special schemes, like GGUF:Q2_K_MIXED.
+
+    Provides some special auto_round recipes.
+    """
+    if scheme_name == "GGUF:Q2_K_MIXED":
+        for n, m in model.named_modules():
+            if n in layer_config:
+                continue
+            if n == "lm_head" or isinstance(m, torch.nn.Embedding):
+                layer_config[n] = "GGUF:Q8_0"
+            elif isinstance(m, torch.nn.Linear) and ("expert" not in n or "shared_experts" in n) and n != "lm_head":
+                layer_config[n] = "GGUF:Q4_K_S"
+    return layer_config
+
 
 def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str:
     if isinstance(scheme, str) and scheme.upper().startswith("GGUF"):
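
The effect of the new helper is easiest to see on a toy model. A minimal sketch with hypothetical module names: embeddings and `lm_head` get `GGUF:Q8_0`, ordinary `Linear` layers get `GGUF:Q4_K_S`, and expert `Linear`s (other than `shared_experts`) are left to the base `GGUF:Q2_K_S` scheme:

```python
# Toy demonstration of _handle_special_schemes (module names are hypothetical).
import torch

from auto_round.schemes import _handle_special_schemes


class ToyMoE(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_tokens = torch.nn.Embedding(128, 16)      # Embedding -> GGUF:Q8_0
        self.q_proj = torch.nn.Linear(16, 16)                # plain Linear -> GGUF:Q4_K_S
        self.experts_up_proj = torch.nn.Linear(16, 16)       # expert -> skipped, stays on base Q2_K_S
        self.shared_experts_gate = torch.nn.Linear(16, 16)   # shared expert -> GGUF:Q4_K_S
        self.lm_head = torch.nn.Linear(16, 128)              # lm_head -> GGUF:Q8_0


cfg = _handle_special_schemes("GGUF:Q2_K_MIXED", {}, ToyMoE())
# {'embed_tokens': 'GGUF:Q8_0', 'q_proj': 'GGUF:Q4_K_S',
#  'shared_experts_gate': 'GGUF:Q4_K_S', 'lm_head': 'GGUF:Q8_0'}
```

Layers already present in `layer_config` are skipped, so user-supplied overrides win over the recipe.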

test/test_cpu/test_gguf_format.py

Lines changed: 9 additions & 0 deletions
@@ -309,6 +309,15 @@ def test_all_format(self):
             assert False, "cmd line test fail, please have a check"
         shutil.rmtree("../../tmp_autoround", ignore_errors=True)
 
+        # test mixed q2_k_s
+        res = os.system(
+            f"cd ../.. && {python_path} -m auto_round --model {model_name}"
+            f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED"
+        )
+        if res > 0 or res == -1:
+            assert False, "cmd line test fail, please have a check"
+        shutil.rmtree("../../tmp_autoround", ignore_errors=True)
+
     def test_vlm_gguf(self):
         model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct"
         from auto_round import AutoRoundMLLM
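
For reference, a hedged sketch of the equivalent Python API call (the model id is a placeholder, and `quantize_and_save` plus the `gguf:q2_k_s` format string are assumed from existing AutoRound usage, not from this commit):

```python
# Hedged sketch: invoking the new scheme through the Python API instead of
# the CLI used in the test above. Model id and export format are assumptions.
from auto_round import AutoRound

autoround = AutoRound(
    "Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model id
    scheme="GGUF:Q2_K_MIXED",      # the new special scheme
    iters=0,                       # RTN fast path, matching the test flags
    nsamples=1,
    seqlen=16,
)
# "_mixed" is normalized to the q2_k_s format at export time (see the base.py hunk).
autoround.quantize_and_save("./tmp_autoround", format="gguf:q2_k_s")
```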
