From 6ffcf60c1c32c28f71fb45fd6eaa7fb50657d076 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 14:14:42 +0800 Subject: [PATCH 01/35] try to enable auto_scheme API --- auto_round/__init__.py | 3 +- auto_round/__main__.py | 9 ++++ auto_round/auto_schemes/__init__.py | 24 +++++++++ auto_round/autoround.py | 5 +- auto_round/compressors/base.py | 76 +++++++++++++++-------------- auto_round/data_type/register.py | 3 +- auto_round/schemes.py | 29 +++++++++-- 7 files changed, 103 insertions(+), 46 deletions(-) create mode 100644 auto_round/auto_schemes/__init__.py diff --git a/auto_round/__init__.py b/auto_round/__init__.py index 15bbc373d..1ce7d5e1e 100644 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -13,11 +13,10 @@ # limitations under the License. from auto_round.autoround import AutoRound -# support for old api from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam +from auto_round.schemes import QuantizationScheme, AutoScheme from auto_round.utils import LazyImport - def __getattr__(name): if name == "AutoHfQuantizer": from auto_round.inference.auto_quantizer import AutoHfQuantizer diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 78a8fc9d6..5c77c2f10 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -466,6 +466,13 @@ def tune(args): extra_config.tuning_config = tuning_config extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config + layer_config = {} + from auto_round.auto_schemes.delta_loss import get_mixed_config_layer_config + best_path = get_mixed_config_layer_config(model_name,target_bits=6) + for item in best_path: + layer_config[item[0]] = {} + layer_config[item[0]]["bits"] = item[1] + layer_config[item[0]]["act_bits"] = item[1] autoround: BaseCompressor = AutoRound( model=model_name, @@ -484,6 +491,8 @@ def tune(args): not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, extra_config=extra_config, + layer_config=layer_config + ) model_name = args.model.rstrip("/") diff --git a/auto_round/auto_schemes/__init__.py b/auto_round/auto_schemes/__init__.py new file mode 100644 index 000000000..38d40e023 --- /dev/null +++ b/auto_round/auto_schemes/__init__.py @@ -0,0 +1,24 @@ +AUTO_SCHEMES_ALGS = {} + +def register_dtype(names): + """Class decorator to register a mixed precision algorithm to the registry. + + Decorator function used before a Pattern subclass. + + Args: + names: A string. Define the export type. + + Returns: + cls: The class of register. 
+ """ + + def register(alg): + if isinstance(names, (tuple, list)): + for name in names: + AUTO_SCHEMES_ALGS[name] = alg + else: + AUTO_SCHEMES_ALGS[names] = alg + + return alg + + return register diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 4074213a9..22d3dc29b 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -25,7 +25,7 @@ MLLMCompressor, ) from auto_round.logger import deprecated, logger -from auto_round.schemes import QuantizationScheme +from auto_round.schemes import QuantizationScheme, AutoScheme from auto_round.utils import is_mllm_model @@ -64,6 +64,7 @@ def __new__( model: Union[torch.nn.Module, str], tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", + auto_scheme: AutoScheme = None, layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -77,7 +78,6 @@ def __new__( seed: int = 42, # for adam enable_adam: bool = False, - # for MLLM extra_config: ExtraConfig = None, **kwargs, ) -> BaseCompressor: @@ -159,6 +159,7 @@ def __new__( model=model, tokenizer=tokenizer, scheme=scheme, + auto_scheme=auto_scheme, layer_config=layer_config, dataset=dataset, iters=iters, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 01546034d..fa7ee0bf8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme +from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, AutoScheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -130,6 +130,7 @@ def __init__( model: Union[torch.nn.Module, str], tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", + auto_scheme: AutoScheme = None, layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -204,7 +205,6 @@ def __init__( """ self.scheme = None self._parse_and_set_scheme(scheme, kwargs) - # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options amp = kwargs.pop("amp", True) @@ -237,7 +237,7 @@ def __init__( logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" - # deprecated, default not to use torch.use_deterministic_algorithms + # Deprecated, default not to use torch.use_deterministic_algorithms if not disable_deterministic_algorithms or enable_deterministic_algorithms: if not disable_deterministic_algorithms: logger.warning( @@ -255,26 +255,14 @@ def __init__( if device_map is None: device_map = 0 - # Set device, must place after model loading - self._set_device(device_map) - - if (isinstance(device_map, dict) and device_map) or device_map == "auto": - self.device_map = device_map - elif isinstance(device_map, str) and "," in device_map: - device_map = device_map.replace(" ", "") # Remove any spaces - self.device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] - self.device_map = "auto" - else: - self.device_map = None - self._set_device_map_in_blocks(self.device_map) # Model related self.quantized = False if isinstance(model, str): model, tokenizer, low_cpu_mem_usage = llm_load_model( model, - device="cpu", - low_cpu_mem_mode=low_cpu_mem_usage, # always load cpu first + device="cpu", # always load cpu first + low_cpu_mem_mode=low_cpu_mem_usage, ) elif tokenizer is None and iters > 0: raise ValueError("A tokenizer must be set for non-str model input") @@ -289,17 +277,23 @@ def __init__( self.tokenizer = tokenizer self.shared_cache_keys = get_shared_keys(self.model) - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - if len(not_quantize_layer_names) > 0: - logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - self._parse_layer_config(layer_config) # must place after model init + self._parse_layer_config(layer_config, fp_layers) # must place after model init self.to_quant_block_names = to_quant_block_names + # Set device, must place after model loading + self._set_device(device_map) + + if (isinstance(device_map, dict) and device_map) or device_map == "auto": + self.device_map = device_map + elif isinstance(device_map, str) and "," in device_map: + device_map = device_map.replace(" ", "") # Remove any spaces + self.device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] + self.device_map = "auto" + else: + self.device_map = None + self._set_device_map_in_blocks(self.device_map) + # Tuning hyperparameters self.seed = seed set_seed(self.seed) @@ -385,7 +379,7 @@ def __init__( import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] - def _set_device(self, device_map): + def _set_device(self, device_map:Union[str, torch.device, int,dict])->None: if hasattr(self, "device") and self.device is not None: return if isinstance(device_map, (str, torch.device, int)): @@ -409,8 +403,16 @@ def _set_device(self, device_map): else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> None: + def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers) -> None: """Parse and set the layer-wise quantization configuration.""" + not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) + if 
len(not_quantize_layer_names) > 0: + logger.info(f"{not_quantize_layer_names} will not be quantized.") + if layer_config is None: + layer_config = {} + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} + # Some other quantization configs self.layer_config = {} if layer_config is None else layer_config scheme_keys = [f.name for f in fields(QuantizationScheme)] @@ -1709,7 +1711,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - self.has_qlayer_outside_block = self._set_layerwise_config(self.layer_config) + self.has_qlayer_outside_block = self._set_layerwise_config(model, self.layer_config) if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: @@ -1935,7 +1937,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: del layer_input clear_memory(q_layer_input) - def _set_layerwise_config(self, layer_config: dict) -> bool: + def _set_layerwise_config(self, model:torch.nn.Module, layer_config: dict) -> bool: """ Sets the layer-wise configuration based on the provided `layer_config`. By default, only quantize layers in blocks. @@ -1950,14 +1952,14 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: # Get the names of layers in quantization blocks supported_types = self.supported_types layers_in_blocks = get_layer_names_in_block( - self.model, supported_types, self.quant_block_list, self.inner_supported_types + model, supported_types, self.quant_block_list, self.inner_supported_types ) - ##process regex in layer_config + # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys keys = get_quant_keys() - for n, m in self.model.named_modules(): + for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning for key in keys: if hasattr(m, key): @@ -1981,7 +1983,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: for match_name in matched_names: layer_config[match_name] = val else: - tmp_m = get_module(self.model, name) + tmp_m = get_module(model, name) if not isinstance(tmp_m, torch.nn.Embedding): # TODO not good code style raise ValueError(f"key {name} in layer_config is invalid, please have a double check") @@ -1989,17 +1991,17 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: # Iterate through all modules in the model is_gguf = hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) - for n, m in self.model.named_modules(): + for n, m in model.named_modules(): # Skip unsupported types if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: - if n in self.layer_config: + if n in layer_config: if not isinstance(m, torch.nn.Embedding): logger.warning(f"{n} is not supported, layer_config {n}: {layer_config[n]} will be ignored.") - self.layer_config.pop(n) + layer_config.pop(n) continue if not is_gguf: if not check_to_quantized(layer_config[n]): - self.layer_config.pop(n) + layer_config.pop(n) continue else: continue diff --git a/auto_round/data_type/register.py b/auto_round/data_type/register.py index 12c4406a4..fca259ed6 100644 --- a/auto_round/data_type/register.py +++ b/auto_round/data_type/register.py @@ -22,8 
+22,7 @@ def register_dtype(names): Decorator function used before a Pattern subclass. Args: - cls (class): The subclass of register. - name: A string. Define the export type. + names: A string. Define the export type. Returns: cls: The class of register. diff --git a/auto_round/schemes.py b/auto_round/schemes.py index a5c5975c9..7b6cf2f4d 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,9 +14,9 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import Generator, List, Optional +from typing import Optional, Iterable -__all__ = ["QuantizationScheme", "preset_name_to_scheme"] +__all__ = ["QuantizationScheme", "preset_name_to_scheme", "AutoScheme"] @dataclass @@ -38,7 +38,7 @@ def from_dict(cls, config: dict): return cls(**config) @classmethod - def get_attributes(cls: "QuantizationScheme") -> List[str]: + def get_attributes(cls: "QuantizationScheme") -> list[str]: return [field.name for field in fields(cls)] def __getitem__(self, key: str): @@ -180,6 +180,8 @@ def is_preset_scheme(name: str) -> bool: } ) + + # FP8 = asdict(QuantArgs.from_dict({ # "bits": 8, # "group_size": 128, @@ -201,6 +203,18 @@ def is_preset_scheme(name: str) -> bool: } ) +# For AutoScheme 16 bits options +BF16 = QuantizationScheme.from_dict( + { + "bits": 16, + "group_size": 0, + "data_type": "fp", + "act_bits": 16, + "act_data_type": "fp", + } +) + + PRESET_SCHEMES = { "W4A16": W4A16, "W2A16": W2A16, @@ -211,6 +225,7 @@ def is_preset_scheme(name: str) -> bool: "NVFP4": NVFP4, "FPW8A16": FPW8A16, "FP8_STATIC": FP8_STATIC, + "BF16": BF16, } from auto_round.export.export_to_gguf.config import GGUF_CONFIG @@ -220,3 +235,11 @@ def is_preset_scheme(name: str) -> bool: value.pop("embedding", None) value.pop("lm_head", None) PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) + + +@dataclass +class AutoScheme: + options:Optional[Iterable[QuantizationScheme]] + target_bits:float + shared_layers:Optional[Iterable[Iterable[str]]]=None + method:str="naive_pre" \ No newline at end of file From 5d80825baa9643790b1ada4061d32818ec82bb04 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 06:15:51 +0000 Subject: [PATCH 02/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/__init__.py | 1 + auto_round/__main__.py | 6 +++--- auto_round/auto_schemes/__init__.py | 15 +++++++++++++++ auto_round/autoround.py | 2 +- auto_round/compressors/base.py | 9 ++++----- auto_round/schemes.py | 11 +++++------ 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/auto_round/__init__.py b/auto_round/__init__.py index 1ce7d5e1e..d7be4984c 100644 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -17,6 +17,7 @@ from auto_round.schemes import QuantizationScheme, AutoScheme from auto_round.utils import LazyImport + def __getattr__(name): if name == "AutoHfQuantizer": from auto_round.inference.auto_quantizer import AutoHfQuantizer diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 5c77c2f10..a69359db8 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -468,7 +468,8 @@ def tune(args): extra_config.mllm_config = mllm_config layer_config = {} from auto_round.auto_schemes.delta_loss import get_mixed_config_layer_config - best_path = get_mixed_config_layer_config(model_name,target_bits=6) + + best_path = get_mixed_config_layer_config(model_name, target_bits=6) for item in best_path: 
layer_config[item[0]] = {} layer_config[item[0]]["bits"] = item[1] @@ -491,8 +492,7 @@ def tune(args): not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, extra_config=extra_config, - layer_config=layer_config - + layer_config=layer_config, ) model_name = args.model.rstrip("/") diff --git a/auto_round/auto_schemes/__init__.py b/auto_round/auto_schemes/__init__.py index 38d40e023..d3b055be2 100644 --- a/auto_round/auto_schemes/__init__.py +++ b/auto_round/auto_schemes/__init__.py @@ -1,5 +1,20 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + AUTO_SCHEMES_ALGS = {} + def register_dtype(names): """Class decorator to register a mixed precision algorithm to the registry. diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 22d3dc29b..ae1a37677 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -25,7 +25,7 @@ MLLMCompressor, ) from auto_round.logger import deprecated, logger -from auto_round.schemes import QuantizationScheme, AutoScheme +from auto_round.schemes import AutoScheme, QuantizationScheme from auto_round.utils import is_mllm_model diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index fa7ee0bf8..72ca17ddc 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, AutoScheme +from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -255,13 +255,12 @@ def __init__( if device_map is None: device_map = 0 - # Model related self.quantized = False if isinstance(model, str): model, tokenizer, low_cpu_mem_usage = llm_load_model( model, - device="cpu", # always load cpu first + device="cpu", # always load cpu first low_cpu_mem_mode=low_cpu_mem_usage, ) elif tokenizer is None and iters > 0: @@ -379,7 +378,7 @@ def __init__( import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] - def _set_device(self, device_map:Union[str, torch.device, int,dict])->None: + def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: if hasattr(self, "device") and self.device is not None: return if isinstance(device_map, (str, torch.device, int)): @@ -1937,7 +1936,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: del layer_input clear_memory(q_layer_input) - def _set_layerwise_config(self, model:torch.nn.Module, layer_config: dict) -> bool: + def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> bool: """ Sets the layer-wise configuration based on the provided `layer_config`. 
By default, only quantize layers in blocks. diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 7b6cf2f4d..af51a881e 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,7 +14,7 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import Optional, Iterable +from typing import Iterable, Optional __all__ = ["QuantizationScheme", "preset_name_to_scheme", "AutoScheme"] @@ -181,7 +181,6 @@ def is_preset_scheme(name: str) -> bool: ) - # FP8 = asdict(QuantArgs.from_dict({ # "bits": 8, # "group_size": 128, @@ -239,7 +238,7 @@ def is_preset_scheme(name: str) -> bool: @dataclass class AutoScheme: - options:Optional[Iterable[QuantizationScheme]] - target_bits:float - shared_layers:Optional[Iterable[Iterable[str]]]=None - method:str="naive_pre" \ No newline at end of file + options: Optional[Iterable[QuantizationScheme]] + target_bits: float + shared_layers: Optional[Iterable[Iterable[str]]] = None + method: str = "naive_pre" From a4ef4950ad11a523e6c2679384c00b5b4ceadaf6 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 14:19:25 +0800 Subject: [PATCH 03/35] update a little --- auto_round/__main__.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index a69359db8..adafd095e 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -466,14 +466,6 @@ def tune(args): extra_config.tuning_config = tuning_config extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config - layer_config = {} - from auto_round.auto_schemes.delta_loss import get_mixed_config_layer_config - - best_path = get_mixed_config_layer_config(model_name, target_bits=6) - for item in best_path: - layer_config[item[0]] = {} - layer_config[item[0]]["bits"] = item[1] - layer_config[item[0]]["act_bits"] = item[1] autoround: BaseCompressor = AutoRound( model=model_name, @@ -491,8 +483,7 @@ def tune(args): fp_layers=args.fp_layers, not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, - extra_config=extra_config, - layer_config=layer_config, + extra_config=extra_config ) model_name = args.model.rstrip("/") From 4173c3eb7626b8509896d49c30958029ced1864e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 06:20:01 +0000 Subject: [PATCH 04/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index adafd095e..78a8fc9d6 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -483,7 +483,7 @@ def tune(args): fp_layers=args.fp_layers, not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, - extra_config=extra_config + extra_config=extra_config, ) model_name = args.model.rstrip("/") From 87e945407d7a09cb083a387be574681eee3a1ce0 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 17:04:32 +0800 Subject: [PATCH 05/35] update a little --- auto_round/compressors/base.py | 5 ++--- auto_round/utils.py | 18 ------------------ 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 72ca17ddc..f08540207 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -75,7 +75,6 @@ get_lm_head_name, get_max_vram, get_module, - get_quant_keys, get_shared_keys, 
htcore, infer_bits_by_data_type, @@ -1710,7 +1709,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - self.has_qlayer_outside_block = self._set_layerwise_config(model, self.layer_config) + self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: @@ -1956,7 +1955,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = get_quant_keys() + keys = [f.name for f in fields(QuantizationScheme)] for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning diff --git a/auto_round/utils.py b/auto_round/utils.py index 9af09758e..131a91db3 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2308,24 +2308,6 @@ def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16): return model -def get_quant_keys(): - keys = [ - "bits", - "group_size", - "sym", - "data_type", - "scale_dtype", - "act_bits", - "act_group_size", - "act_sym", - "act_dynamic", - "act_data_type", - "super_bits", - "super_group_size", - ] - return keys - - def out_of_vram(error_msg): error_msg = str(error_msg) # CUDA From 242d1ee29eb4833f0f2aba2d322e0380f8712ea5 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 17:28:48 +0800 Subject: [PATCH 06/35] try to refine parse layer config code --- auto_round/compressors/base.py | 179 +++++++++++++++++++++++++++++++++ auto_round/utils.py | 5 +- 2 files changed, 182 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f08540207..cbe95703b 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -401,6 +401,185 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") + + + # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different + def _convert_value_layer_config_to_dict(self, + layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> dict: + + new_layer_config = {} if layer_config is None else layer_config + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key, item in new_layer_config.items(): + if isinstance(item, str): + item = asdict(preset_name_to_scheme(item.upper())) + new_layer_config[key] = item + elif isinstance(item, QuantizationScheme): + config = asdict(item) + tmp_keys = copy.deepcopy(list(config.keys())) + for tmp_key in tmp_keys: # Pop None value to be overridden + if config[tmp_key] is None: + config.pop(tmp_key) + elif isinstance(item, dict): + item_keys = item.keys() + if item_keys not in scheme_keys: + for item_key in item_keys: + if item_key not in scheme_keys: + raise ValueError( + f"the key {item_key} in layer_config for layer {key} is invalid," + f" only {scheme_keys} are supported" + ) + new_layer_config[key]["fixed_by_user"] = True + return new_layer_config + + def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, dict], fp_layers, quant_lm_head, + scheme, quant_block_list, 
supported_types, inner_supported_types): + """ + Sets the layer-wise configuration based on the provided `layer_config`. + By default, only quantize layers in blocks. + + Args: + layer_config (dict): The configuration dictionary for each layer containing various configuration options. + + Returns: + bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), + otherwise returns False. + """ + + # set fp layers + not_quantize_layer_names = get_fp_layer_names(model, fp_layers) + # if len(not_quantize_layer_names) > 0: + # logger.info(f"{not_quantize_layer_names} will not be quantized.") + if layer_config is None: + layer_config = {} + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", + "act_data_type": "float", "fixed_by_user": True} + + # Get the names of layers in quantization blocks + layers_in_blocks = get_layer_names_in_block( + model, supported_types, quant_block_list, inner_supported_types + ) + # Process regex in layer_config + all_supported_layer_names = [] + # List of configuration keys + scheme_keys = (f.name for f in fields(QuantizationScheme)) + + for n, m in model.named_modules(): + # Delete previous configuration to avoid conflicts with prior tuning + for key in scheme_keys: + if hasattr(m, key): + delattr(m, key) + if type(m) not in supported_types and m.__class__.__name__ not in self.inner_supported_types: + continue + all_supported_layer_names.append(n) + + names_in_layer_config = list(layer_config.keys()) + for name in names_in_layer_config: + if name in all_supported_layer_names: + continue + matched_names = [] + for layer_name in all_supported_layer_names: + if re.search(re.compile(name), layer_name) is not None: + matched_names.append(layer_name) + if len(matched_names) > 0: + val = layer_config[name] + layer_config.pop(name) + for match_name in matched_names: + layer_config[match_name] = val + else: + tmp_m = get_module(model, name) + if type(tmp_m) != torch.nn.Embedding: # GGUF needs to quantize embedding layer + raise ValueError(f"key {name} in layer_config is invalid, please have a double check") + + has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) + + # Iterate through all modules in the model + is_gguf = ("gguf" in scheme.lower() or + (hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats))) + for n, m in model.named_modules(): + # Skip unsupported types + if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: + if n in layer_config: + if not isinstance(m, torch.nn.Embedding): + logger.warning(f"{n} is not supported, layer_config {n}: {layer_config[n]} will be ignored.") + layer_config.pop(n) + + if not is_gguf: # TODO the code here seems to could be deleted + if not check_to_quantized(layer_config[n]): + layer_config.pop(n) + + continue + + # If the layer is not in the config and is part of a quantization block, use default configuration + if n not in layer_config.keys() and n in layers_in_blocks: + layer_config[n] = {} + for key in scheme_keys: + layer_config[n][key] = getattr(self, key) + + # If the layer is partially configured, fill in missing values + elif n in layer_config.keys(): + if "data_type" in layer_config[n] and "bits" not in layer_config[n]: + tmp_bits = infer_bits_by_data_type(layer_config[n]["data_type"]) + if tmp_bits is not None and tmp_bits != self.bits: + logger.warning( + f"'data_type' do not match the specified 'bits' setting 
for {n}." + f" Resetting 'bits' to {tmp_bits}." + ) + layer_config[n]["bits"] = tmp_bits + if "act_data_type" in layer_config[n] and "act_bits" not in layer_config[n]: + tmp_bits = infer_bits_by_data_type(layer_config[n]["act_data_type"]) + if tmp_bits is not None and tmp_bits != self.act_bits: + logger.warning( + f"'act_data_type' do not match the specified 'act_bits' setting for {n}." + f" Resetting 'act_bits' to {tmp_bits}." + ) + layer_config[n]["act_bits"] = tmp_bits + + for key in scheme_keys: + if key not in layer_config[n].keys(): + layer_config[n][key] = getattr(self, key) + layer_config[n]["fixed_by_user"] = True + + # If the layer is not in the config and not part of a quantization block, + # use default configuration and set specific values + else: + layer_config[n] = {} + for key in scheme_keys: + layer_config[n][key] = getattr(self, key) + layer_config[n]["bits"] = 16 + layer_config[n]["act_bits"] = 16 + + if n in layers_in_blocks: + layer_config[n]["in_blocks"] = True + else: + layer_config[n]["in_blocks"] = False + + # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block + if ( + n not in layers_in_blocks + and check_to_quantized(layer_config[n]) + and not isinstance(m, torch.nn.Embedding) + ): + has_qlayer_outside_block = True + + in_features, out_features = get_layer_features(m) + if in_features <= layer_config[n]["group_size"]: + layer_config[n]["group_size"] = -1 + + # Apply the configuration to the corresponding layer in the model + for key in scheme_keys: + setattr(m, key, layer_config[n][key]) + + + # TODO self.quant_lm_head has not handleed yet + + need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() + if need_to_quantize_lm_head: + has_qlayer_outside_block = True + + # Return whether there are quantized layers outside the blocks + return has_qlayer_outside_block + def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers) -> None: """Parse and set the layer-wise quantization configuration.""" not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) diff --git a/auto_round/utils.py b/auto_round/utils.py index 131a91db3..bd3d1d2b5 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -21,6 +21,7 @@ import re import sys from collections import UserDict +from dataclasses import fields from enum import Enum from functools import lru_cache from pathlib import Path @@ -2278,8 +2279,8 @@ def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16): new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype) if layer.bias is not None: new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype)) - - keys = get_quant_keys() + ["tmp_name"] + scheme_keys = [f.name for f in fields(QuantizationScheme)] + keys = scheme_keys + ["tmp_name"] for key in keys: setattr(new_layer, key, getattr(layer, key, None)) From 4fc6b64a56d0fa811f0f210833f366acebe9c918 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 09:29:46 +0000 Subject: [PATCH 07/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 61 ++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index cbe95703b..0cb65f332 100644 --- a/auto_round/compressors/base.py +++ 
b/auto_round/compressors/base.py @@ -401,11 +401,10 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - - # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different - def _convert_value_layer_config_to_dict(self, - layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> dict: + def _convert_value_layer_config_to_dict( + self, layer_config: dict[str, Union[str, dict, QuantizationScheme]] + ) -> dict: new_layer_config = {} if layer_config is None else layer_config scheme_keys = [f.name for f in fields(QuantizationScheme)] @@ -431,19 +430,28 @@ def _convert_value_layer_config_to_dict(self, new_layer_config[key]["fixed_by_user"] = True return new_layer_config - def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, dict], fp_layers, quant_lm_head, - scheme, quant_block_list, supported_types, inner_supported_types): + def _expand_layer_config( + self, + model: torch.nn.Module, + layer_config: dict[str, dict], + fp_layers, + quant_lm_head, + scheme, + quant_block_list, + supported_types, + inner_supported_types, + ): """ - Sets the layer-wise configuration based on the provided `layer_config`. - By default, only quantize layers in blocks. + Sets the layer-wise configuration based on the provided `layer_config`. + By default, only quantize layers in blocks. - Args: - layer_config (dict): The configuration dictionary for each layer containing various configuration options. + Args: + layer_config (dict): The configuration dictionary for each layer containing various configuration options. - Returns: - bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), - otherwise returns False. - """ + Returns: + bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), + otherwise returns False. 
+ """ # set fp layers not_quantize_layer_names = get_fp_layer_names(model, fp_layers) @@ -452,13 +460,16 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d if layer_config is None: layer_config = {} for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", - "act_data_type": "float", "fixed_by_user": True} + layer_config[name] = { + "bits": 16, + "act_bits": 16, + "data_type": "float", + "act_data_type": "float", + "fixed_by_user": True, + } # Get the names of layers in quantization blocks - layers_in_blocks = get_layer_names_in_block( - model, supported_types, quant_block_list, inner_supported_types - ) + layers_in_blocks = get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types) # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys @@ -494,8 +505,9 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) # Iterate through all modules in the model - is_gguf = ("gguf" in scheme.lower() or - (hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats))) + is_gguf = "gguf" in scheme.lower() or ( + hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) + ) for n, m in model.named_modules(): # Skip unsupported types if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: @@ -556,9 +568,9 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block if ( - n not in layers_in_blocks - and check_to_quantized(layer_config[n]) - and not isinstance(m, torch.nn.Embedding) + n not in layers_in_blocks + and check_to_quantized(layer_config[n]) + and not isinstance(m, torch.nn.Embedding) ): has_qlayer_outside_block = True @@ -570,7 +582,6 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d for key in scheme_keys: setattr(m, key, layer_config[n][key]) - # TODO self.quant_lm_head has not handleed yet need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() From 7f76db26d3f3c8fe51b777928eb6ca078b22c138 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 05:08:34 +0000 Subject: [PATCH 08/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index e67adfc15..ecf87a62d 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -296,7 +296,7 @@ def __init__( for name in not_quantize_layer_names: layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} self._parse_layer_config(layer_config) # must place after model init - + self.to_quant_block_names = to_quant_block_names # Set device, must place after model loading From ae8837b0b2bb1e8ef8ae03085d4b6b728977e495 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 13:29:59 +0800 Subject: [PATCH 09/35] fix --- auto_round/compressors/base.py | 2 +- auto_round/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 53667c090..aaac722ba 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2145,7 +2145,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = [f.name for f in fields(QuantizationScheme)] + keys = (f.name for f in fields(QuantizationScheme)) + ("scale_dtype") for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning diff --git a/auto_round/utils.py b/auto_round/utils.py index dedaf5c2c..a48751d3e 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2279,8 +2279,8 @@ def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16): new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype) if layer.bias is not None: new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype)) - scheme_keys = [f.name for f in fields(QuantizationScheme)] - keys = scheme_keys + ["tmp_name"] + scheme_keys = (f.name for f in fields(QuantizationScheme)) + keys = scheme_keys + ("tmp_name", "scale_dtype") for key in keys: setattr(new_layer, key, getattr(layer, key, None)) From 531224de42da3f9ed466bc30c15121ec5597e80f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 13:34:11 +0800 Subject: [PATCH 10/35] fix --- auto_round/compressors/base.py | 13 +++---------- auto_round/utils.py | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 71df8b89f..8c0b699a2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -230,7 +230,7 @@ def __init__( self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False # Scale factor for RAM usage per parameter. self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - fp_layers = kwargs.pop("fp_layers", None) + fp_layers = kwargs.pop("fp_layers", "") if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") @@ -288,14 +288,7 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - if len(not_quantize_layer_names) > 0: - logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - self._parse_layer_config(layer_config) # must place after model init + self._parse_layer_config(layer_config, fp_layers) # Must place after model init self.to_quant_block_names = to_quant_block_names @@ -611,7 +604,7 @@ def _expand_layer_config( # Return whether there are quantized layers outside the blocks return has_qlayer_outside_block - def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers) -> None: + def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers:str) -> None: """Parse and set the layer-wise quantization configuration.""" not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) if len(not_quantize_layer_names) > 0: diff --git a/auto_round/utils.py b/auto_round/utils.py index a48751d3e..1cb36f2fb 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1046,7 +1046,7 @@ def can_pack_with_numba(): # pragma: no cover return True -def get_fp_layer_names(model, fp_layers): +def get_fp_layer_names(model:torch.nn.Module, fp_layers:str): """Identifies and returns layers in the model to exclude from quantization. This function processes a comma-separated list of fully precision (FP) layers, From c9fa4088aee6d24b05993ea2d3766105ab6793db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 05:34:44 +0000 Subject: [PATCH 11/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 4 +++- auto_round/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8c0b699a2..bc83e041a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -604,7 +604,9 @@ def _expand_layer_config( # Return whether there are quantized layers outside the blocks return has_qlayer_outside_block - def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers:str) -> None: + def _parse_layer_config( + self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str + ) -> None: """Parse and set the layer-wise quantization configuration.""" not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) if len(not_quantize_layer_names) > 0: diff --git a/auto_round/utils.py b/auto_round/utils.py index 1cb36f2fb..8525d7a88 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1046,7 +1046,7 @@ def can_pack_with_numba(): # pragma: no cover return True -def get_fp_layer_names(model:torch.nn.Module, fp_layers:str): +def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): """Identifies and returns layers in the model to exclude from quantization. 
This function processes a comma-separated list of fully precision (FP) layers, From 6453200001920d6c9a0402680aaf4507bc45924a Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 13:41:32 +0800 Subject: [PATCH 12/35] fix --- auto_round/compressors/base.py | 2 +- auto_round/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8c0b699a2..2512ef5e8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2158,7 +2158,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = (f.name for f in fields(QuantizationScheme)) + ("scale_dtype") + keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype") for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning diff --git a/auto_round/utils.py b/auto_round/utils.py index 1cb36f2fb..9bbdbc161 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2280,7 +2280,7 @@ def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16): if layer.bias is not None: new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype)) scheme_keys = (f.name for f in fields(QuantizationScheme)) - keys = scheme_keys + ("tmp_name", "scale_dtype") + keys = tuple(scheme_keys) + ("tmp_name", "scale_dtype") for key in keys: setattr(new_layer, key, getattr(layer, key, None)) From 3811010768472c9da67b13213dc1d571d457a8cd Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 14:07:19 +0800 Subject: [PATCH 13/35] tmp_change --- auto_round/compressors/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 48814853b..c7c3abe40 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -416,7 +416,7 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different def _convert_value_layer_config_to_dict( - self, layer_config: dict[str, Union[str, dict, QuantizationScheme]] + self, layer_config: dict[str, Union[str, dict, QuantizationScheme]],default_scheme:QuantizationScheme, ) -> dict: new_layer_config = {} if layer_config is None else layer_config @@ -441,6 +441,7 @@ def _convert_value_layer_config_to_dict( f" only {scheme_keys} are supported" ) new_layer_config[key]["fixed_by_user"] = True + return new_layer_config def _expand_layer_config( From 4de7b0879cba422eac13532ed716df816b06c6ba Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 14:43:26 +0800 Subject: [PATCH 14/35] commit --- auto_round/compressors/base.py | 53 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c7c3abe40..067cddeda 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -416,31 +416,48 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different def _convert_value_layer_config_to_dict( - self, layer_config: dict[str, Union[str, dict, QuantizationScheme]],default_scheme:QuantizationScheme, + self, + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + 
default_scheme: "QuantizationScheme", + use_auto_mixed_bit_in_gguf: bool = False, ) -> dict: + """ + Convert layer_config values (string, dict, QuantizationScheme) into a standardized dict format. + Adds 'fixed_by_user': True for each processed layer config. + """ + if layer_config is None: + return {} + + scheme_keys = {f.name for f in fields(QuantizationScheme)} + new_layer_config = copy.deepcopy(layer_config) - new_layer_config = {} if layer_config is None else layer_config - scheme_keys = [f.name for f in fields(QuantizationScheme)] for key, item in new_layer_config.items(): if isinstance(item, str): - item = asdict(preset_name_to_scheme(item.upper())) - new_layer_config[key] = item + # Convert preset name to scheme dict + config = asdict(preset_name_to_scheme(item.upper())) elif isinstance(item, QuantizationScheme): config = asdict(item) - tmp_keys = copy.deepcopy(list(config.keys())) - for tmp_key in tmp_keys: # Pop None value to be overridden - if config[tmp_key] is None: - config.pop(tmp_key) elif isinstance(item, dict): - item_keys = item.keys() - if item_keys not in scheme_keys: - for item_key in item_keys: - if item_key not in scheme_keys: - raise ValueError( - f"the key {item_key} in layer_config for layer {key} is invalid," - f" only {scheme_keys} are supported" - ) - new_layer_config[key]["fixed_by_user"] = True + # Validate dict keys + invalid_keys = set(item) - scheme_keys + if invalid_keys: + raise ValueError( + f"Invalid keys {invalid_keys} in layer_config for layer '{key}', " + f"only {scheme_keys} are supported." + ) + config = dict(item) + else: + raise TypeError( + f"Unsupported type for layer_config[{key}]: {type(item)}. " + f"Expected str, dict, or QuantizationScheme." + ) + + # Drop None values + config = {k: v for k, v in config.items() if v is not None} + + # Mark as user-fixed + config["fixed_by_user"] = True + new_layer_config[key] = config return new_layer_config From a9f0e444fff29ca12aafd6dd24bc7e8a933534c1 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 14:44:44 +0800 Subject: [PATCH 15/35] commit --- auto_round/compressors/base.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 067cddeda..13ef303a0 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -446,11 +446,6 @@ def _convert_value_layer_config_to_dict( f"only {scheme_keys} are supported." ) config = dict(item) - else: - raise TypeError( - f"Unsupported type for layer_config[{key}]: {type(item)}. " - f"Expected str, dict, or QuantizationScheme." 
- ) # Drop None values config = {k: v for k, v in config.items() if v is not None} From 59a9f5df246da7d9676d6315435b6626da07e582 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 16:39:17 +0800 Subject: [PATCH 16/35] update a little --- auto_round/compressors/base.py | 116 +++++++++++++++++++++++++++------ auto_round/schemes.py | 14 +++- 2 files changed, 109 insertions(+), 21 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 13ef303a0..c12aea15f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -414,47 +414,125 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different - def _convert_value_layer_config_to_dict( + def _prepare_layer_config( self, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + model: torch.nn.Module, + orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], default_scheme: "QuantizationScheme", - use_auto_mixed_bit_in_gguf: bool = False, + supported_types, + inner_supported_types, + fp_layers: str = "", + quant_lm_head: bool = False, ) -> dict: """ - Convert layer_config values (string, dict, QuantizationScheme) into a standardized dict format. - Adds 'fixed_by_user': True for each processed layer config. + Normalize and validate layer-specific quantization schemes, + expand regex-based configs, and merge with default scheme. """ - if layer_config is None: - return {} + from auto_round.schemes import is_gguf_scheme scheme_keys = {f.name for f in fields(QuantizationScheme)} - new_layer_config = copy.deepcopy(layer_config) + layer_config = copy.deepcopy(orig_layer_config) or {} + + # Mark layers that should stay in FP + not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) + for name in not_quantize_layer_names: + layer_config[name] = { + "bits": 16, + "act_bits": 16, + "data_type": "float", + "act_data_type": "float", + } - for key, item in new_layer_config.items(): + def normalize_item(item, layer_name: str) -> dict: + """Convert a single config entry to dict and validate keys.""" if isinstance(item, str): - # Convert preset name to scheme dict config = asdict(preset_name_to_scheme(item.upper())) elif isinstance(item, QuantizationScheme): config = asdict(item) elif isinstance(item, dict): - # Validate dict keys invalid_keys = set(item) - scheme_keys if invalid_keys: raise ValueError( - f"Invalid keys {invalid_keys} in layer_config for layer '{key}', " + f"Invalid keys {invalid_keys} in layer_config for layer '{layer_name}', " f"only {scheme_keys} are supported." ) config = dict(item) - - # Drop None values + else: + raise TypeError( + f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " + f"Expected str, dict, or QuantizationScheme." 
+ ) + # Drop None values & mark as fixed config = {k: v for k, v in config.items() if v is not None} - - # Mark as user-fixed config["fixed_by_user"] = True - new_layer_config[key] = config + return config + + # Normalize configs + layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} + + # Infer missing bits from data_type / act_data_type + for cfg in layer_config.values(): + if "data_type" in cfg and "bits" not in cfg: + if (tmp_bits := infer_bits_by_data_type(cfg["data_type"])) is not None: + cfg["bits"] = tmp_bits + if "act_data_type" in cfg and "act_bits" not in cfg: + if (tmp_bits := infer_bits_by_data_type(cfg["act_data_type"])) is not None: + cfg["act_bits"] = tmp_bits + + # Fill missing values from default scheme + default_dict = asdict(default_scheme) + for cfg in layer_config.values(): + for scheme_key in scheme_keys: + cfg.setdefault(scheme_key, default_dict.get(scheme_key)) + + # Special case for GGUF + is_gguf = is_gguf_scheme(default_scheme) + if is_gguf and torch.nn.Embedding not in supported_types: + supported_types = tuple(list(supported_types) + [torch.nn.Embedding]) + + # Collect all supported layer names + all_supported_layer_names = [] + for n, m in model.named_modules(): + # Clear old attributes to avoid conflicts + for key in scheme_keys: + if hasattr(m, key): + delattr(m, key) + if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: + continue + all_supported_layer_names.append(n) + + # Expand regex configs (compile once, reuse) + for name in list(layer_config.keys()): + if name in all_supported_layer_names: + continue + regex = re.compile(name) + matched_names = [ln for ln in all_supported_layer_names if regex.search(ln)] + if matched_names: + val = layer_config.pop(name) + for match_name in matched_names: + layer_config[match_name] = val + else: + raise ValueError(f"Key '{name}' in layer_config is invalid, please double check.") + + # Enforce group_size = 32 constraint for INT weight-only quantization + if default_scheme.data_type == "int" and default_scheme.act_bits >= 16 and not is_gguf: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + if n in layer_config: + layer_config[n]["bits"] = 16 + layer_config[n]["data_type"] = "fp" + logger.warning_once( + f"{n} will not be quantized because its shape is not divisible by 32. " + "It will be exported in FP16 instead." 
+ ) + + # Handle lm_head + lm_head_name = get_lm_head_name(model) + if lm_head_name not in layer_config and (quant_lm_head or is_gguf): + layer_config[lm_head_name] = default_dict.copy() - return new_layer_config + return layer_config def _expand_layer_config( self, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index af51a881e..9c12b61c0 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,9 +14,9 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import Iterable, Optional +from typing import Iterable, Optional, Union -__all__ = ["QuantizationScheme", "preset_name_to_scheme", "AutoScheme"] +__all__ = ["QuantizationScheme", "is_gguf_scheme", "preset_name_to_scheme", "AutoScheme"] @dataclass @@ -235,6 +235,16 @@ def is_preset_scheme(name: str) -> bool: value.pop("lm_head", None) PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) +def is_gguf_scheme(scheme:Union[str, QuantizationScheme])->bool: + if isinstance(scheme,str) and scheme.upper().startswith("GGUF"): + return True + for key, val in PRESET_SCHEMES.items(): + if not key.upper().startswith("GGUF"): + continue + if val==scheme: + return True + return False + @dataclass class AutoScheme: From 1b7e911656995558c6ea900a09f97f705cadd089 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:39:58 +0000 Subject: [PATCH 17/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 16 ++++++++-------- auto_round/schemes.py | 7 ++++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c12aea15f..e8bb2fad2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -415,14 +415,14 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _prepare_layer_config( - self, - model: torch.nn.Module, - orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - supported_types, - inner_supported_types, - fp_layers: str = "", - quant_lm_head: bool = False, + self, + model: torch.nn.Module, + orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + supported_types, + inner_supported_types, + fp_layers: str = "", + quant_lm_head: bool = False, ) -> dict: """ Normalize and validate layer-specific quantization schemes, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 9c12b61c0..97a3cdf02 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -235,13 +235,14 @@ def is_preset_scheme(name: str) -> bool: value.pop("lm_head", None) PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) -def is_gguf_scheme(scheme:Union[str, QuantizationScheme])->bool: - if isinstance(scheme,str) and scheme.upper().startswith("GGUF"): + +def is_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: + if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return True for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue - if val==scheme: + if val == scheme: return True return False From e0680493b54b9e64e0eb8d26219435c2c3f58170 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 16:44:08 +0800 
Subject: [PATCH 18/35] fix --- auto_round/compressors/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c12aea15f..7538fc362 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -519,12 +519,13 @@ def normalize_item(item, layer_name: str) -> dict: for n, m in model.named_modules(): if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - if n in layer_config: - layer_config[n]["bits"] = 16 - layer_config[n]["data_type"] = "fp" + if n not in layer_config: + layer_config[n] = default_dict.copy() + layer_config[n]["bits"] = 16 + layer_config[n]["data_type"] = "fp" + layer_config[n]["fixed_by_user"] = True logger.warning_once( f"{n} will not be quantized because its shape is not divisible by 32. " - "It will be exported in FP16 instead." ) # Handle lm_head From 0357c0b94b7070da52c9492212be961fab69994e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:45:30 +0000 Subject: [PATCH 19/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a1558d247..182cc435f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -524,9 +524,7 @@ def normalize_item(item, layer_name: str) -> dict: layer_config[n]["bits"] = 16 layer_config[n]["data_type"] = "fp" layer_config[n]["fixed_by_user"] = True - logger.warning_once( - f"{n} will not be quantized because its shape is not divisible by 32. " - ) + logger.warning_once(f"{n} will not be quantized because its shape is not divisible by 32. 
") # Handle lm_head lm_head_name = get_lm_head_name(model) From 602421c6ab2340476ece3a6959409b7d58be8320 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 18:22:47 +0800 Subject: [PATCH 20/35] merge autoscheme to scheme --- auto_round/compressors/base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8dbb0e3cc..0e21e00d5 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -128,8 +128,7 @@ def __init__( self, model: Union[torch.nn.Module, str], tokenizer=None, - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - auto_scheme: AutoScheme = None, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -2247,7 +2246,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype") + keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning From 091c5ad0e9045dbcc693aa3163ca4ec476ddf44f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 14:31:43 +0800 Subject: [PATCH 21/35] refine layer_config code --- auto_round/__main__.py | 10 + auto_round/autoround.py | 4 +- auto_round/compressors/base.py | 635 +++++++++------------------------ auto_round/schemes.py | 8 +- auto_round/utils.py | 18 +- 5 files changed, 191 insertions(+), 484 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 07bc3f273..43f55a050 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -470,6 +470,15 @@ def tune(args): extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config + layer_config = {} + # from auto_round.auto_schemes.haha import get_mixed_config_layer_config + # layer_config = {} + # best_path = get_mixed_config_layer_config(model_name, target_bits=3) + # for item in best_path: + # layer_config[item[0]] = {} + # layer_config[item[0]]["bits"] = item[1] + + autoround: BaseCompressor = AutoRound( model=model_name, scheme=scheme, @@ -486,6 +495,7 @@ def tune(args): not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, extra_config=extra_config, + layer_config=layer_config, ) model_name = args.model.rstrip("/") diff --git a/auto_round/autoround.py b/auto_round/autoround.py index ae1a37677..ccdca1f09 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -63,8 +63,7 @@ def __new__( cls, model: Union[torch.nn.Module, str], tokenizer=None, - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - auto_scheme: AutoScheme = None, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -159,7 +158,6 @@ def __new__( model=model, tokenizer=tokenizer, scheme=scheme, - auto_scheme=auto_scheme, layer_config=layer_config, dataset=dataset, iters=iters, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 0e21e00d5..590c480e8 100644 --- a/auto_round/compressors/base.py +++ 
b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme +from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme, get_gguf_scheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -201,8 +201,7 @@ def __init__( ... # ... ... } """ - self.scheme = None - self._parse_and_set_scheme(scheme, kwargs) + self.scheme = self._parse_and_set_scheme(scheme, kwargs) # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options amp = kwargs.pop("amp", True) @@ -229,7 +228,8 @@ def __init__( self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False # Scale factor for RAM usage per parameter. self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - fp_layers = kwargs.pop("fp_layers", "") + self.fp_layers = kwargs.pop("fp_layers", "") + self.layer_config = layer_config if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") @@ -287,7 +287,7 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - self._parse_layer_config(layer_config, fp_layers) # Must place after model init + # self._parse_layer_config(layer_config, fp_layers) # Must place after model init self.to_quant_block_names = to_quant_block_names @@ -414,46 +414,46 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _prepare_layer_config( - self, - model: torch.nn.Module, - orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - supported_types, - inner_supported_types, - fp_layers: str = "", - quant_lm_head: bool = False, - ) -> dict: + self, + model: torch.nn.Module, + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + default_scale_dtype: torch.dtype | str, + supported_types: tuple, + inner_supported_types: tuple, + quant_block_list=None, + fp_layers: str = "", + quant_lm_head: bool = False, + enable_gguf_official_mixed: bool = True, + is_mllm: bool = False, + ) -> tuple[dict, bool]: """ - Normalize and validate layer-specific quantization schemes, - expand regex-based configs, and merge with default scheme. + Normalize, validate, and expand layer-specific quantization configs. 
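A minimal sketch of the layer_config inputs this helper is written to accept, following the normalize_item branches below; the layer names, preset name, and values are illustrative assumptions rather than part of the patch:

    from auto_round.schemes import QuantizationScheme

    layer_config = {
        "model.decoder.layers.0.self_attn.q_proj": "W2A16",            # preset name, resolved via preset_name_to_scheme
        "model.decoder.layers.1.fc1": {"bits": 8, "group_size": 128},  # partial dict; unset keys come from the default scheme
        "lm_head": QuantizationScheme.from_dict({"bits": 4}),          # scheme object, converted with asdict
    }
    # After normalization every entry is a plain dict with fixed_by_user=True,
    # bits inferred from data_type when missing, and regex keys expanded to
    # the matching layer names.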
+ Returns (final_layer_config, has_quant_layer_outside_block) """ - from auto_round.schemes import is_gguf_scheme - scheme_keys = {f.name for f in fields(QuantizationScheme)} - layer_config = copy.deepcopy(orig_layer_config) or {} + from auto_round.schemes import get_gguf_scheme - # Mark layers that should stay in FP - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - for name in not_quantize_layer_names: - layer_config[name] = { - "bits": 16, - "act_bits": 16, - "data_type": "float", - "act_data_type": "float", - } + # ---- helpers ------------------------------------------------- + def dispatch_layer_config(layer_config: dict[str, dict]) -> None: + """Assign scheme values as attributes to matched modules.""" + for layer_name, scheme in layer_config.items(): + module = get_module(model, layer_name) + for attr, value in scheme.items(): + setattr(module, attr, value) - def normalize_item(item, layer_name: str) -> dict: - """Convert a single config entry to dict and validate keys.""" + def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: + """Convert config entry into dict and validate keys.""" if isinstance(item, str): config = asdict(preset_name_to_scheme(item.upper())) elif isinstance(item, QuantizationScheme): config = asdict(item) elif isinstance(item, dict): - invalid_keys = set(item) - scheme_keys - if invalid_keys: + invalid = set(item) - set(scheme_keys) + if invalid: raise ValueError( - f"Invalid keys {invalid_keys} in layer_config for layer '{layer_name}', " - f"only {scheme_keys} are supported." + f"Invalid keys {invalid} in layer_config for '{layer_name}'. " + f"Allowed keys: {scheme_keys}" ) config = dict(item) else: @@ -461,237 +461,135 @@ def normalize_item(item, layer_name: str) -> dict: f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " f"Expected str, dict, or QuantizationScheme." ) - # Drop None values & mark as fixed + # Clean up config = {k: v for k, v in config.items() if v is not None} config["fixed_by_user"] = True return config - # Normalize configs + # ---- main logic ---------------------------------------------- + scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) + layer_config = copy.deepcopy(layer_config) or {} + + # 1. fp_layers -> force 16 + for name in get_fp_layer_names(self.model, fp_layers): + layer_config[name] = { + "bits": 16, "act_bits": 16, + "data_type": "float", "act_data_type": "float" + } + + # 2. normalize layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} - # Infer missing bits from data_type / act_data_type + # 3. infer missing bits for cfg in layer_config.values(): if "data_type" in cfg and "bits" not in cfg: - if (tmp_bits := infer_bits_by_data_type(cfg["data_type"])) is not None: - cfg["bits"] = tmp_bits + if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: + cfg["bits"] = b if "act_data_type" in cfg and "act_bits" not in cfg: - if (tmp_bits := infer_bits_by_data_type(cfg["act_data_type"])) is not None: - cfg["act_bits"] = tmp_bits + if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: + cfg["act_bits"] = b - # Fill missing values from default scheme + # 4. 
fill defaults default_dict = asdict(default_scheme) + default_dict["scale_dtype"] = default_scale_dtype for cfg in layer_config.values(): - for scheme_key in scheme_keys: - cfg.setdefault(scheme_key, default_dict.get(scheme_key)) + for key in scheme_keys: + cfg.setdefault(key, default_dict.get(key)) - # Special case for GGUF - is_gguf = is_gguf_scheme(default_scheme) - if is_gguf and torch.nn.Embedding not in supported_types: - supported_types = tuple(list(supported_types) + [torch.nn.Embedding]) + # 5. collect supported modules + gguf_name = get_gguf_scheme(default_scheme) + if gguf_name and torch.nn.Embedding not in supported_types: + supported_types = (*supported_types, torch.nn.Embedding) - # Collect all supported layer names - all_supported_layer_names = [] + all_layer_names, embedding_layer_names = [], [] for n, m in model.named_modules(): - # Clear old attributes to avoid conflicts + # cleanup stale attributes for key in scheme_keys: if hasattr(m, key): delattr(m, key) if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: continue - all_supported_layer_names.append(n) + all_layer_names.append(n) + if isinstance(m, torch.nn.Embedding): + embedding_layer_names.append(n) - # Expand regex configs (compile once, reuse) + # 6. expand regex configs for name in list(layer_config.keys()): - if name in all_supported_layer_names: + if name in all_layer_names: continue regex = re.compile(name) - matched_names = [ln for ln in all_supported_layer_names if regex.search(ln)] - if matched_names: - val = layer_config.pop(name) - for match_name in matched_names: - layer_config[match_name] = val - else: - raise ValueError(f"Key '{name}' in layer_config is invalid, please double check.") - - # Enforce group_size = 32 constraint for INT weight-only quantization - if default_scheme.data_type == "int" and default_scheme.act_bits >= 16 and not is_gguf: - for n, m in model.named_modules(): - if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: - if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - if n not in layer_config: - layer_config[n] = default_dict.copy() - layer_config[n]["bits"] = 16 - layer_config[n]["data_type"] = "fp" - layer_config[n]["fixed_by_user"] = True - logger.warning_once(f"{n} will not be quantized because its shape is not divisible by 32. ") - - # Handle lm_head + matched = [ln for ln in all_layer_names if regex.search(ln)] + if not matched: + raise ValueError(f"Invalid regex '{name}' in layer_config, no match found.") + val = layer_config.pop(name) + for match in matched: + layer_config[match] = val + + # 7. 
lm_head lm_head_name = get_lm_head_name(model) - if lm_head_name not in layer_config and (quant_lm_head or is_gguf): + tied_lm_head = False + if ( + hasattr(model, "config") + and model.config.tie_word_embeddings + and hasattr(model, "_tied_weights_keys") + ): + tied_keys =model._tied_weights_keys + if lm_head_name in tied_keys: + tied_lm_head=True + if quant_lm_head and tied_lm_head: + quant_lm_head=False + logger.warning("reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently") + + if lm_head_name not in layer_config and quant_lm_head: layer_config[lm_head_name] = default_dict.copy() - return layer_config - - def _expand_layer_config( - self, - model: torch.nn.Module, - layer_config: dict[str, dict], - fp_layers, - quant_lm_head, - scheme, - quant_block_list, - supported_types, - inner_supported_types, - ): - """ - Sets the layer-wise configuration based on the provided `layer_config`. - By default, only quantize layers in blocks. - - Args: - layer_config (dict): The configuration dictionary for each layer containing various configuration options. - - Returns: - bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), - otherwise returns False. - """ - - # set fp layers - not_quantize_layer_names = get_fp_layer_names(model, fp_layers) - # if len(not_quantize_layer_names) > 0: - # logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = { - "bits": 16, - "act_bits": 16, - "data_type": "float", - "act_data_type": "float", - "fixed_by_user": True, - } - - # Get the names of layers in quantization blocks - layers_in_blocks = get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types) - # Process regex in layer_config - all_supported_layer_names = [] - # List of configuration keys - scheme_keys = (f.name for f in fields(QuantizationScheme)) - - for n, m in model.named_modules(): - # Delete previous configuration to avoid conflicts with prior tuning - for key in scheme_keys: - if hasattr(m, key): - delattr(m, key) - if type(m) not in supported_types and m.__class__.__name__ not in self.inner_supported_types: - continue - all_supported_layer_names.append(n) - - names_in_layer_config = list(layer_config.keys()) - for name in names_in_layer_config: - if name in all_supported_layer_names: - continue - matched_names = [] - for layer_name in all_supported_layer_names: - if re.search(re.compile(name), layer_name) is not None: - matched_names.append(layer_name) - if len(matched_names) > 0: - val = layer_config[name] - layer_config.pop(name) - for match_name in matched_names: - layer_config[match_name] = val - else: - tmp_m = get_module(model, name) - if type(tmp_m) != torch.nn.Embedding: # GGUF needs to quantize embedding layer - raise ValueError(f"key {name} in layer_config is invalid, please have a double check") - - has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) - - # Iterate through all modules in the model - is_gguf = "gguf" in scheme.lower() or ( - hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) - ) - for n, m in model.named_modules(): - # Skip unsupported types - if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: - if n in layer_config: - if not isinstance(m, torch.nn.Embedding): - logger.warning(f"{n} is not supported, layer_config 
{n}: {layer_config[n]} will be ignored.") - layer_config.pop(n) - - if not is_gguf: # TODO the code here seems to could be deleted - if not check_to_quantized(layer_config[n]): - layer_config.pop(n) - - continue - - # If the layer is not in the config and is part of a quantization block, use default configuration - if n not in layer_config.keys() and n in layers_in_blocks: - layer_config[n] = {} - for key in scheme_keys: - layer_config[n][key] = getattr(self, key) - - # If the layer is partially configured, fill in missing values - elif n in layer_config.keys(): - if "data_type" in layer_config[n] and "bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["data_type"]) - if tmp_bits is not None and tmp_bits != self.bits: - logger.warning( - f"'data_type' do not match the specified 'bits' setting for {n}." - f" Resetting 'bits' to {tmp_bits}." - ) - layer_config[n]["bits"] = tmp_bits - if "act_data_type" in layer_config[n] and "act_bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["act_data_type"]) - if tmp_bits is not None and tmp_bits != self.act_bits: - logger.warning( - f"'act_data_type' do not match the specified 'act_bits' setting for {n}." - f" Resetting 'act_bits' to {tmp_bits}." - ) - layer_config[n]["act_bits"] = tmp_bits - - for key in scheme_keys: - if key not in layer_config[n].keys(): - layer_config[n][key] = getattr(self, key) - layer_config[n]["fixed_by_user"] = True - - # If the layer is not in the config and not part of a quantization block, - # use default configuration and set specific values - else: - layer_config[n] = {} - for key in scheme_keys: - layer_config[n][key] = getattr(self, key) - layer_config[n]["bits"] = 16 - layer_config[n]["act_bits"] = 16 - - if n in layers_in_blocks: - layer_config[n]["in_blocks"] = True - else: - layer_config[n]["in_blocks"] = False - - # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block - if ( - n not in layers_in_blocks - and check_to_quantized(layer_config[n]) - and not isinstance(m, torch.nn.Embedding) - ): + # 8. enforce shape divisibility for int weight-only + if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: + layer_config.setdefault(n, default_dict.copy()) + layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) + logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") + + # 9. block layers: mark as in_blocks=True + for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): + cfg = layer_config.setdefault(name, default_dict.copy()) + cfg["in_blocks"] = True + + # ---- restore: ensure missing in_blocks are set to False and compute flag ---- + has_qlayer_outside_block = False + for cfg in layer_config.values(): + if "in_blocks" not in cfg: + cfg["in_blocks"] = False + # 如果 layer 不在 blocks 且需要量化,则标记存在 blocks 外的量化层 + if not cfg["in_blocks"] and check_to_quantized(cfg): has_qlayer_outside_block = True - in_features, out_features = get_layer_features(m) - if in_features <= layer_config[n]["group_size"]: - layer_config[n]["group_size"] = -1 + # 10. 
GGUF handling + if not gguf_name: + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block - # Apply the configuration to the corresponding layer in the model - for key in scheme_keys: - setattr(m, key, layer_config[n][key]) + # embed + lm_head defaults for gguf + if lm_head_name not in layer_config and not tied_lm_head: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[lm_head_name] = cfg + has_qlayer_outside_block = True + for emd_name in embedding_layer_names: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[emd_name] = cfg - # TODO self.quant_lm_head has not handleed yet + if enable_gguf_official_mixed: + model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT + layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) + + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block - need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() - if need_to_quantize_lm_head: - has_qlayer_outside_block = True - # Return whether there are quantized layers outside the blocks - return has_qlayer_outside_block def _parse_layer_config( self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str @@ -753,7 +651,7 @@ def _parse_layer_config( if key not in lm_head_layer_config: lm_head_layer_config[key] = getattr(self, key) - def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> None: + def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) @@ -761,7 +659,6 @@ def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kw scheme = scheme elif isinstance(scheme, str): scheme = scheme.upper() - self.scheme = scheme scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] for key in scheme_keys: @@ -807,6 +704,9 @@ def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kw if supported_dtype + str(tmp_act_bits) == self.act_data_type: # could not replace FP8_e4m3 self.act_data_type = supported_dtype break + for key in scheme_keys: + scheme[key] = getattr(self, key) + return QuantizationScheme.from_dict(scheme) def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" @@ -1112,19 +1012,29 @@ def remove_duplicates(lst): formats = format.replace("q*_", f"q{self.bits}_").replace(" ", "").split(",") formats = remove_duplicates(formats) # need the keep origin order - if isinstance(self.scheme, str) and self.scheme.lower().startswith("gguf"): + gguf_format_name = get_gguf_scheme(self.scheme) + + if gguf_format_name: for i in range(len(formats)): - if formats[i] != "fake" and formats[i] != self.scheme.lower(): + if formats[i] != "fake" and formats[i] != gguf_format_name.lower(): logger.warning( - f"reset format {formats[i]} to {self.scheme.lower()} " - f"since scheme {self.scheme} can only be exported to format {self.scheme.lower()}" + f"reset format {formats[i]} to {gguf_format_name.lower()} " + f"since scheme {gguf_format_name} can only be exported to format {gguf_format_name.lower()}" ) - 
formats[i] = self.scheme.lower() + formats[i] = gguf_format_name.lower() + _gguf_args_check(self, formats, model_type=ModelType.TEXT) if self.mllm: _gguf_args_check(self, formats, model_type=ModelType.MMPROJ) + for f in formats: + if f.startswith("gguf"): + self.scheme = preset_name_to_scheme(f) + break + + + for format_ in formats: if format_ not in SUPPORTED_FORMATS: logger.error(f"Unsupported format {format_}, please choose from {SUPPORTED_FORMATS}") @@ -1608,91 +1518,6 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _check_need_to_quantize_lm_head_embedding(self) -> bool: - """Checks if LM head and embedding layers need quantization for GGUF format. - - This function inspects the current model's formats and determines whether - it needs to apply quantization settings to the embedding and LM head layers. - The function modifies `self.layer_config` in-place and updates the model modules. - - Returns: - bool: True if the LM head needs quantization, otherwise False. - - Raises: - NotImplementedError: If multiple non-fake GGUF formats are specified. - """ - gguf_scheme = False - if isinstance(self.scheme, str) and "gguf" in self.scheme.lower(): - gguf_scheme = True - - if not hasattr(self, "formats") and not gguf_scheme: - return False - - has_gguf: bool = gguf_scheme or any("gguf" in fmt for fmt in self.formats) - if not has_gguf: - return False - if hasattr(self, "formats"): - formats: list[str] = [fmt for fmt in self.formats if "fake" not in fmt] - if not (len(formats) == 1 and "gguf" in formats[0]): - raise NotImplementedError("Only one GGUF format can be set at a time.") - target_format: str = formats[0] - - else: - target_format = self.scheme.lower() - - tie_word_embeddings: bool = getattr(getattr(self.model, "config", None), "tie_word_embeddings", True) - for name, module in self.model.named_modules(): - if isinstance(module, torch.nn.Embedding): - key: str = "lm_head" if tie_word_embeddings else "embedding" - config: dict[str, Any] = GGUF_INNER_CONFIG[GGUF_CONFIG[target_format][key]] - self._apply_config_to_layer(name, config, True) - - if not tie_word_embeddings: - lm_head_name: str = get_lm_head_name(self.model) - config: dict[str, Any] = GGUF_CONFIG[GGUF_CONFIG[target_format]["lm_head"]] - check_fixed_by_user = ( - self.layer_config[lm_head_name].get("fixed_by_user", False) - if lm_head_name in self.layer_config - else None - ) - self._apply_config_to_layer(lm_head_name, config, check_fixed_by_user=check_fixed_by_user) - return True - - return False - - def _apply_config_to_layer( - self, - layer_name: str, - config: dict[str, Any], - check_fixed_by_user: bool = False, - ) -> None: - """Applies GGUF quantization configuration to a given layer. - - Args: - layer_name (str): Name of the layer to configure. - config (dict[str, Any]): GGUF layer configuration. - check_fixed_by_user (bool): If True, preserve user-defined settings. 
- """ - act_bits: int = 16 - scale_dtype: Any = self.scale_dtype - keys: list[str] = ["bits", "group_size", "super_bits", "super_group_size", "data_type", "sym"] - - self.layer_config[layer_name] = self.layer_config.get(layer_name, {}) - - for key in keys: - if ( - key in self.layer_config[layer_name] - and check_fixed_by_user - # and self.layer_config[layer_name].get("fixed_by_user", False) - ): - continue - self.layer_config[layer_name][key] = config.get(key) - setattr(get_module(self.model, layer_name), key, config.get(key)) - - self.layer_config[layer_name]["act_bits"] = act_bits - self.layer_config[layer_name]["scale_dtype"] = scale_dtype - setattr(get_module(self.model, layer_name), "act_bits", act_bits) - setattr(get_module(self.model, layer_name), "scale_dtype", scale_dtype) def _quantize_layer_via_rtn(self, name: str) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. @@ -1993,14 +1818,21 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. """ - for n, m in self.model.named_modules(): + for n, m in self.model.named_modules(): # TODO check if could removed m.tmp_name = n self._check_compatibility() formats = self.formats if hasattr(self, "formats") else None # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) + # self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) + # TODO check scale_dtype + self.layer_config, self.has_qlayer_outside_block = ( + self._prepare_layer_config(self.model, self.layer_config,self.scheme, self.scale_dtype, + self.supported_types,self.inner_supported_types,self.quant_block_list, + self.fp_layers,self.quant_lm_head, + enable_gguf_official_mixed=True,is_mllm=self.mllm)) + if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: @@ -2011,14 +1843,14 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: break if len(self.formats) == 1 and self.formats[0] == "fake": only_gguf = False - if only_gguf: - self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - self.layer_config, self.formats, self.model, model_type=ModelType.TEXT - ) - if self.mllm: - self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - self.layer_config, self.formats, self.model, model_type=ModelType.MMPROJ - ) + # if only_gguf: + # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( + # self.layer_config, self.formats, self.model, model_type=ModelType.TEXT + # ) + # if self.mllm: + # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( + # self.layer_config, self.formats, self.model, model_type=ModelType.MMPROJ + # ) # Determine if immediate packing is required formats = self.formats if ( @@ -2226,141 +2058,6 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: del layer_input clear_memory(q_layer_input) - def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> bool: - """ - Sets the layer-wise configuration based on the provided `layer_config`. - By default, only quantize layers in blocks. - - Args: - layer_config (dict): The configuration dictionary for each layer containing various configuration options. 
- - Returns: - bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), - otherwise returns False. - """ - # Get the names of layers in quantization blocks - supported_types = self.supported_types - layers_in_blocks = get_layer_names_in_block( - model, supported_types, self.quant_block_list, self.inner_supported_types - ) - # Process regex in layer_config - all_supported_layer_names = [] - # List of configuration keys - keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) - - for n, m in model.named_modules(): - # Delete previous configuration to avoid conflicts with prior tuning - for key in keys: - if hasattr(m, key): - delattr(m, key) - - if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: - continue - all_supported_layer_names.append(n) - - names_in_layer_config = list(layer_config.keys()) - for name in names_in_layer_config: - if name in all_supported_layer_names: - continue - matched_names = [] - for layer_name in all_supported_layer_names: - if re.search(re.compile(name), layer_name) is not None: - matched_names.append(layer_name) - if len(matched_names) > 0: - val = layer_config[name] - layer_config.pop(name) - for match_name in matched_names: - layer_config[match_name] = val - else: - tmp_m = get_module(model, name) - if not isinstance(tmp_m, torch.nn.Embedding): # TODO not good code style - raise ValueError(f"key {name} in layer_config is invalid, please have a double check") - - has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) - - # Iterate through all modules in the model - is_gguf = hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) - for n, m in model.named_modules(): - # Skip unsupported types - if type(m) not in supported_types and m.__class__.__name__ not in self.inner_supported_types: - if n in self.layer_config: - if not isinstance(m, torch.nn.Embedding): - logger.warning(f"{n} is not supported, layer_config {n}: {layer_config[n]} will be ignored.") - layer_config.pop(n) - continue - if not is_gguf: - if not check_to_quantized(layer_config[n]): - layer_config.pop(n) - continue - else: - continue - - # If the layer is not in the config and is part of a quantization block, use default configuration - if n not in layer_config.keys() and n in layers_in_blocks: - layer_config[n] = {} - for key in keys: - layer_config[n][key] = getattr(self, key) - - # If the layer is partially configured, fill in missing values - elif n in layer_config.keys(): - if "data_type" in layer_config[n] and "bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["data_type"]) - if tmp_bits is not None and tmp_bits != self.bits: - logger.warning( - f"'data_type' do not match the specified 'bits' setting for {n}." - f" Resetting 'bits' to {tmp_bits}." - ) - layer_config[n]["bits"] = tmp_bits - if "act_data_type" in layer_config[n] and "act_bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["act_data_type"]) - if tmp_bits is not None and tmp_bits != self.act_bits: - logger.warning( - f"'act_data_type' do not match the specified 'act_bits' setting for {n}." - f" Resetting 'act_bits' to {tmp_bits}." 
- ) - layer_config[n]["act_bits"] = tmp_bits - - for key in keys: - if key not in layer_config[n].keys(): - layer_config[n][key] = getattr(self, key) - layer_config[n]["fixed_by_user"] = True - - # If the layer is not in the config and not part of a quantization block, - # use default configuration and set specific values - else: - layer_config[n] = {} - for key in keys: - layer_config[n][key] = getattr(self, key) - layer_config[n]["bits"] = 16 - layer_config[n]["act_bits"] = 16 - - if n in layers_in_blocks: - layer_config[n]["in_blocks"] = True - else: - layer_config[n]["in_blocks"] = False - - # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block - if ( - n not in layers_in_blocks - and check_to_quantized(layer_config[n]) - and not isinstance(m, torch.nn.Embedding) - ): - has_qlayer_outside_block = True - - in_features, out_features = get_layer_features(m) - if in_features <= layer_config[n]["group_size"]: - layer_config[n]["group_size"] = -1 - - # Apply the configuration to the corresponding layer in the model - for key in keys: - setattr(m, key, layer_config[n][key]) - need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() - if need_to_quantize_lm_head: - has_qlayer_outside_block = True - - # Return whether there are quantized layers outside the blocks - return has_qlayer_outside_block - @torch.no_grad() def _get_block_outputs( self, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 97a3cdf02..f6ca0cc98 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -16,7 +16,7 @@ from dataclasses import dataclass, fields from typing import Iterable, Optional, Union -__all__ = ["QuantizationScheme", "is_gguf_scheme", "preset_name_to_scheme", "AutoScheme"] +__all__ = ["QuantizationScheme", "get_gguf_scheme", "preset_name_to_scheme", "AutoScheme"] @dataclass @@ -236,15 +236,15 @@ def is_preset_scheme(name: str) -> bool: PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) -def is_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: +def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return True for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue if val == scheme: - return True - return False + return key + return None @dataclass diff --git a/auto_round/utils.py b/auto_round/utils.py index 575b8e3e8..3d35d303e 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -766,8 +766,9 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): def get_layer_names_in_block( - model, supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), quant_block_list=None, class_names=None -): + model:torch.nn.Module, supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), + quant_block_list:list=None, class_names:tuple=None +) -> list[str]: """Retrieves the names of layers within each block of the model. 
Returns: @@ -778,7 +779,7 @@ def get_layer_names_in_block( class_names = [] for n, m in model.named_modules(): if type(m) in supported_types or (class_names is not None and m.__class__.__name__ in class_names): - m.tmp_name = n + m.backup_name = n layers_in_block = [] if bool(quant_block_list): all_blocks = quant_block_list @@ -788,8 +789,9 @@ def get_layer_names_in_block( for block_name in block_names: block = get_module(model, block_name) for n, m in block.named_modules(): - if hasattr(m, "tmp_name"): - layers_in_block.append(m.tmp_name) + if hasattr(m, "backup_name"): + layers_in_block.append(m.backup_name) + delattr(m, "backup_name") return layers_in_block @@ -1840,9 +1842,9 @@ def _gguf_type_fallback(gguf_type): ##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 -def get_layer_config_by_gguf_format(layer_config, gguf_format, model, model_type=ModelType.TEXT): - # TODO: support for other format later - target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) +def get_layer_config_by_gguf_format(layer_config, target_gguf_format:str, model, model_type=ModelType.TEXT): + # # TODO: support for other format later + # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) import gguf # pylint: disable=E0401 From f027801f450d8036e4a98d09a4b6cc3826a7fb21 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 06:33:01 +0000 Subject: [PATCH 22/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/__main__.py | 1 - auto_round/compressors/base.py | 78 ++++++++++++++++------------------ auto_round/utils.py | 8 ++-- 3 files changed, 42 insertions(+), 45 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 43f55a050..97e3eb6ff 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -478,7 +478,6 @@ def tune(args): # layer_config[item[0]] = {} # layer_config[item[0]]["bits"] = item[1] - autoround: BaseCompressor = AutoRound( model=model_name, scheme=scheme, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6178623f0..c5928da60 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme, get_gguf_scheme +from auto_round.schemes import AutoScheme, QuantizationScheme, get_gguf_scheme, preset_name_to_scheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -414,18 +414,18 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _prepare_layer_config( - self, - model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - default_scale_dtype: torch.dtype | str, - supported_types: tuple, - inner_supported_types: tuple, - quant_block_list=None, - fp_layers: str = "", - quant_lm_head: bool = False, - enable_gguf_official_mixed: bool = True, - is_mllm: bool = False, + self, + model: torch.nn.Module, 
+ layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + default_scale_dtype: torch.dtype | str, + supported_types: tuple, + inner_supported_types: tuple, + quant_block_list=None, + fp_layers: str = "", + quant_lm_head: bool = False, + enable_gguf_official_mixed: bool = True, + is_mllm: bool = False, ) -> tuple[dict, bool]: """ Normalize, validate, and expand layer-specific quantization configs. @@ -452,8 +452,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str invalid = set(item) - set(scheme_keys) if invalid: raise ValueError( - f"Invalid keys {invalid} in layer_config for '{layer_name}'. " - f"Allowed keys: {scheme_keys}" + f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" ) config = dict(item) else: @@ -472,10 +471,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 1. fp_layers -> force 16 for name in get_fp_layer_names(self.model, fp_layers): - layer_config[name] = { - "bits": 16, "act_bits": 16, - "data_type": "float", "act_data_type": "float" - } + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} # 2. normalize layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} @@ -528,17 +524,15 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 7. lm_head lm_head_name = get_lm_head_name(model) tied_lm_head = False - if ( - hasattr(model, "config") - and model.config.tie_word_embeddings - and hasattr(model, "_tied_weights_keys") - ): - tied_keys =model._tied_weights_keys + if hasattr(model, "config") and model.config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"): + tied_keys = model._tied_weights_keys if lm_head_name in tied_keys: - tied_lm_head=True + tied_lm_head = True if quant_lm_head and tied_lm_head: - quant_lm_head=False - logger.warning("reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently") + quant_lm_head = False + logger.warning( + "reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently" + ) if lm_head_name not in layer_config and quant_lm_head: layer_config[lm_head_name] = default_dict.copy() @@ -589,8 +583,6 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str dispatch_layer_config(layer_config) return layer_config, has_qlayer_outside_block - - def _parse_layer_config( self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str ) -> None: @@ -651,7 +643,7 @@ def _parse_layer_config( if key not in lm_head_layer_config: lm_head_layer_config[key] = getattr(self, key) - def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: + def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) @@ -1023,7 +1015,6 @@ def remove_duplicates(lst): ) formats[i] = gguf_format_name.lower() - _gguf_args_check(self, formats, model_type=ModelType.TEXT) if self.mllm: _gguf_args_check(self, formats, model_type=ModelType.MMPROJ) @@ -1033,8 +1024,6 @@ def remove_duplicates(lst): self.scheme = preset_name_to_scheme(f) break - - for format_ in formats: if format_ not in SUPPORTED_FORMATS: logger.error(f"Unsupported format {format_}, please choose from 
{SUPPORTED_FORMATS}") @@ -1518,7 +1507,6 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. @@ -1818,7 +1806,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. """ - for n, m in self.model.named_modules(): # TODO check if could removed + for n, m in self.model.named_modules(): # TODO check if could removed m.tmp_name = n self._check_compatibility() formats = self.formats if hasattr(self, "formats") else None @@ -1827,11 +1815,19 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.model = _handle_moe_model(self.model, formats=formats) # self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) # TODO check scale_dtype - self.layer_config, self.has_qlayer_outside_block = ( - self._prepare_layer_config(self.model, self.layer_config,self.scheme, self.scale_dtype, - self.supported_types,self.inner_supported_types,self.quant_block_list, - self.fp_layers,self.quant_lm_head, - enable_gguf_official_mixed=True,is_mllm=self.mllm)) + self.layer_config, self.has_qlayer_outside_block = self._prepare_layer_config( + self.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.fp_layers, + self.quant_lm_head, + enable_gguf_official_mixed=True, + is_mllm=self.mllm, + ) if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") diff --git a/auto_round/utils.py b/auto_round/utils.py index 3d35d303e..cac86e397 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -766,8 +766,10 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): def get_layer_names_in_block( - model:torch.nn.Module, supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), - quant_block_list:list=None, class_names:tuple=None + model: torch.nn.Module, + supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), + quant_block_list: list = None, + class_names: tuple = None, ) -> list[str]: """Retrieves the names of layers within each block of the model. 
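A small usage sketch of the helper whose signature is reformatted above, assuming both functions are importable from auto_round.utils (as in the compressor's own import list) and reusing the model from the test added later in this series:

    import transformers

    from auto_round.utils import get_block_names, get_layer_names_in_block

    model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
    blocks = get_block_names(model)  # nested list of decoder block names
    names = get_layer_names_in_block(model, quant_block_list=blocks)
    print(names[:3])  # fully qualified Linear/Conv1D layer names inside the blocks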
@@ -1842,7 +1844,7 @@ def _gguf_type_fallback(gguf_type): ##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 -def get_layer_config_by_gguf_format(layer_config, target_gguf_format:str, model, model_type=ModelType.TEXT): +def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT): # # TODO: support for other format later # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) From c6b78c6ad7276a2ba207d506e6857ba282b507ea Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 14:56:09 +0800 Subject: [PATCH 23/35] tiny change --- auto_round/compressors/base.py | 41 +++++++++++----------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c5928da60..931f18197 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -413,10 +413,10 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _prepare_layer_config( + def _set_layer_config( self, model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + layer_config: dict[str, str | dict | "QuantizationScheme"], default_scheme: "QuantizationScheme", default_scale_dtype: torch.dtype | str, supported_types: tuple, @@ -523,15 +523,15 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 7. lm_head lm_head_name = get_lm_head_name(model) - tied_lm_head = False - if hasattr(model, "config") and model.config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"): - tied_keys = model._tied_weights_keys - if lm_head_name in tied_keys: - tied_lm_head = True - if quant_lm_head and tied_lm_head: + tie_word_embeddings = False + if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): + tie_word_embeddings = model.config.tie_word_embeddings + + if quant_lm_head and tie_word_embeddings: quant_lm_head = False logger.warning( - "reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently" + "reset `quant_lm_head` to false as quantizing " + "lm_head with tied weights has not been supported currently" ) if lm_head_name not in layer_config and quant_lm_head: @@ -566,7 +566,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str return layer_config, has_qlayer_outside_block # embed + lm_head defaults for gguf - if lm_head_name not in layer_config and not tied_lm_head: + if lm_head_name not in layer_config and not tie_word_embeddings: cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} layer_config[lm_head_name] = cfg @@ -1813,9 +1813,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. 
self.model = _handle_moe_model(self.model, formats=formats) - # self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) + # TODO check scale_dtype - self.layer_config, self.has_qlayer_outside_block = self._prepare_layer_config( + self.layer_config, self.has_qlayer_outside_block = self._set_layer_config( self.model, self.layer_config, self.scheme, @@ -1832,21 +1832,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: - only_gguf = True - for format_ in self.formats: - if not ("gguf" in format_ or "fake" in format_): - only_gguf = False - break - if len(self.formats) == 1 and self.formats[0] == "fake": - only_gguf = False - # if only_gguf: - # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - # self.layer_config, self.formats, self.model, model_type=ModelType.TEXT - # ) - # if self.mllm: - # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - # self.layer_config, self.formats, self.model, model_type=ModelType.MMPROJ - # ) # Determine if immediate packing is required formats = self.formats if ( @@ -1958,7 +1943,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: cost_time = end_time - self.start_time logger.info(f"quantization tuning time {cost_time}") - ## dump a summary + # Dump a summary quantized_layers = [] unquantized_layers = [] for n, m in self.model.named_modules(): From 1b9f24e8fde62ffa7f0e0e8321b69ae3c6d6479e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 14:57:58 +0800 Subject: [PATCH 24/35] tiny fix --- auto_round/schemes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/schemes.py b/auto_round/schemes.py index f6ca0cc98..8dde95430 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -236,7 +236,7 @@ def is_preset_scheme(name: str) -> bool: PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) -def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: +def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return True for key, val in PRESET_SCHEMES.items(): From 2c0075ae48c98d095ef68515af51f509c94af1be Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 15:11:58 +0800 Subject: [PATCH 25/35] tmp change --- auto_round/__main__.py | 2 +- auto_round/compressors/base.py | 6 ++++-- auto_round/schemes.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 97e3eb6ff..a25fc5421 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -110,7 +110,7 @@ def __init__(self, *args, **kwargs): self.add_argument( "--scale_dtype", - default="fp16", + default=None, choices=["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"], help="scale data type to use for quantization", ) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 931f18197..281efff46 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -202,6 +202,10 @@ def __init__( ... 
} """ self.scheme = self._parse_and_set_scheme(scheme, kwargs) + + gguf_scheme_name = get_gguf_scheme(self.scheme) + # GGUF uses fp32 scale dtype as default + scale_dtype = kwargs.pop("scale_dtype", "fp32") if gguf_scheme_name else kwargs.pop("scale_dtype", "fp16") # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options amp = kwargs.pop("amp", True) @@ -214,7 +218,6 @@ def __init__( sampler = kwargs.pop("sampler", "rand") not_use_best_mse = kwargs.pop("not_use_best_mse", False) dynamic_max_gap = kwargs.pop("dynamic_max_gap", -1) - scale_dtype = kwargs.pop("scale_dtype", "fp16") nblocks = kwargs.pop("nblocks", 1) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) @@ -287,7 +290,6 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - # self._parse_layer_config(layer_config, fp_layers) # Must place after model init self.to_quant_block_names = to_quant_block_names diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 8dde95430..c5513d79a 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -244,7 +244,7 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: continue if val == scheme: return key - return None + return "" @dataclass From 97198f07b6fe660e6fba8b63c224ddb7440441d1 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 15:40:49 +0800 Subject: [PATCH 26/35] tmp change --- auto_round/auto_schemes/utils.py | 5 ++++ auto_round/compressors/base.py | 39 +++++++++--------------------- auto_round/schemes.py | 4 +-- test/test_cuda/test_auto_scheme.py | 33 +++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 29 deletions(-) create mode 100644 auto_round/auto_schemes/utils.py create mode 100644 test/test_cuda/test_auto_scheme.py diff --git a/auto_round/auto_schemes/utils.py b/auto_round/auto_schemes/utils.py new file mode 100644 index 000000000..fdcd343e7 --- /dev/null +++ b/auto_round/auto_schemes/utils.py @@ -0,0 +1,5 @@ +def get_total_bits(model, layer_config): + pass + +def get_bits(layer): + pass diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 281efff46..8477b2ed7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -201,7 +201,10 @@ def __init__( ... # ... ... 
} """ - self.scheme = self._parse_and_set_scheme(scheme, kwargs) + if isinstance(scheme, AutoScheme): #TODO AutoScheme could also be patched by group_size, etc + self.scheme = self._parse_and_set_scheme(scheme.options[0], kwargs) + else: + self.scheme = self._parse_and_set_scheme(scheme, kwargs) gguf_scheme_name = get_gguf_scheme(self.scheme) # GGUF uses fp32 scale dtype as default @@ -271,6 +274,12 @@ def __init__( self.tokenizer = tokenizer self.shared_cache_keys = get_shared_keys(self.model) + self.to_quant_block_names = to_quant_block_names + if not hasattr(self, "quant_block_list"): + all_blocks = get_block_names(model) + self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names) + + if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead") @@ -290,9 +299,6 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - - self.to_quant_block_names = to_quant_block_names - # Set device, must place after model loading self._set_device(device_map) @@ -342,27 +348,6 @@ def __init__( if self.static_kv_dtype is not None: logger.warning("The static kv is experimental and currently has limited support.") - # Model related - self.quantized = False - if isinstance(model, str): - model, tokenizer, low_cpu_mem_usage = llm_load_model( - model, device=device, low_cpu_mem_mode=low_cpu_mem_usage - ) - elif tokenizer is None and iters > 0: - raise ValueError("A tokenizer must be set for non-str model input") - self.low_cpu_mem_usage = bool(low_cpu_mem_usage) - if unsupported_meta_device(model): - raise RuntimeError( - "AutoRound does not support parameters on meta device. " - "Please use more GPUs by setting `--device_map 0,1,2,3` or just place the model on CPU." - ) - self.model = model.eval() - self.tokenizer = tokenizer - self.shared_cache_keys = get_shared_keys(self.model) - if not hasattr(self, "quant_block_list"): - all_blocks = get_block_names(model) - self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names) - self.scale_dtype = convert_dtype_str2torch(scale_dtype) self._set_amp_dtype() self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device @@ -418,7 +403,7 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: def _set_layer_config( self, model: torch.nn.Module, - layer_config: dict[str, str | dict | "QuantizationScheme"], + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], default_scheme: "QuantizationScheme", default_scale_dtype: torch.dtype | str, supported_types: tuple, @@ -558,7 +543,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str for cfg in layer_config.values(): if "in_blocks" not in cfg: cfg["in_blocks"] = False - # 如果 layer 不在 blocks 且需要量化,则标记存在 blocks 外的量化层 + # mark layer outside block if not cfg["in_blocks"] and check_to_quantized(cfg): has_qlayer_outside_block = True diff --git a/auto_round/schemes.py b/auto_round/schemes.py index c5513d79a..ee12607eb 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -249,7 +249,7 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: @dataclass class AutoScheme: - options: Optional[Iterable[QuantizationScheme]] + options: Optional[Iterable[QuantizationScheme|str]] target_bits: float shared_layers: Optional[Iterable[Iterable[str]]] = None - method: str = "naive_pre" + method: str = "default" diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py new 
file mode 100644 index 000000000..4fd2e9c8b --- /dev/null +++ b/test/test_cuda/test_auto_scheme.py @@ -0,0 +1,33 @@ +import copy +import re +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") +import torch +import transformers +from lm_eval.utils import make_table # pylint: disable=E0401 +from transformers import AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound, AutoRoundConfig,AutoScheme +from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model +from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 + +class TestAutoScheme(unittest.TestCase): + @classmethod + def setUpClass(self): + self.save_dir = "./saved" + self.tasks = "lambada_openai" + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + + def test_auto_scheme(self): + model_name = "facebook/opt-125m" + scheme = AutoScheme(target_bits=3, options=("W2A16","W4A16","BF16")) + ar = AutoRound(model_name=model_name,scheme=scheme) + ar.quantize_and_save(self.save_dir) From 27b4b4da882966b06ee750fe93e5fe4db3694bde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 07:43:18 +0000 Subject: [PATCH 27/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/auto_schemes/utils.py | 16 ++++++++++++++++ auto_round/compressors/base.py | 3 +-- auto_round/schemes.py | 2 +- test/test_cuda/test_auto_scheme.py | 8 ++++---- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/auto_round/auto_schemes/utils.py b/auto_round/auto_schemes/utils.py index fdcd343e7..e01da9913 100644 --- a/auto_round/auto_schemes/utils.py +++ b/auto_round/auto_schemes/utils.py @@ -1,5 +1,21 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + def get_total_bits(model, layer_config): pass + def get_bits(layer): pass diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8477b2ed7..f7cd773d8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -201,7 +201,7 @@ def __init__( ... # ... ... 
} """ - if isinstance(scheme, AutoScheme): #TODO AutoScheme could also be patched by group_size, etc + if isinstance(scheme, AutoScheme): # TODO AutoScheme could also be patched by group_size, etc self.scheme = self._parse_and_set_scheme(scheme.options[0], kwargs) else: self.scheme = self._parse_and_set_scheme(scheme, kwargs) @@ -279,7 +279,6 @@ def __init__( all_blocks = get_block_names(model) self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names) - if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead") diff --git a/auto_round/schemes.py b/auto_round/schemes.py index ee12607eb..38bed87e1 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -249,7 +249,7 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: @dataclass class AutoScheme: - options: Optional[Iterable[QuantizationScheme|str]] + options: Optional[Iterable[QuantizationScheme | str]] target_bits: float shared_layers: Optional[Iterable[Iterable[str]]] = None method: str = "default" diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 4fd2e9c8b..6376e92c2 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -10,10 +10,11 @@ from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig,AutoScheme +from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 + class TestAutoScheme(unittest.TestCase): @classmethod def setUpClass(self): @@ -25,9 +26,8 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_auto_scheme(self): model_name = "facebook/opt-125m" - scheme = AutoScheme(target_bits=3, options=("W2A16","W4A16","BF16")) - ar = AutoRound(model_name=model_name,scheme=scheme) + scheme = AutoScheme(target_bits=3, options=("W2A16", "W4A16", "BF16")) + ar = AutoRound(model_name=model_name, scheme=scheme) ar.quantize_and_save(self.save_dir) From 2d3095a05368e3c082861df49d0cff4c0b7855c2 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 16:36:52 +0800 Subject: [PATCH 28/35] update --- auto_round/auto_schemes/gen_scheme.py | 19 ++ auto_round/compressors/base.py | 261 +++----------------------- auto_round/schemes.py | 2 +- auto_round/utils.py | 177 ++++++++++++++++- test/test_cuda/test_auto_scheme.py | 11 +- 5 files changed, 224 insertions(+), 246 deletions(-) create mode 100644 auto_round/auto_schemes/gen_scheme.py diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py new file mode 100644 index 000000000..badf39742 --- /dev/null +++ b/auto_round/auto_schemes/gen_scheme.py @@ -0,0 +1,19 @@ +from typing import Union, Iterable + +import torch + +from auto_round import AutoScheme + + +class GenScheme: + def __init__(self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme:dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k" + ): + pass + + diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f7cd773d8..495cf3d04 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -93,7 +93,7 @@ set_module, to_device, to_dtype, - 
unsupported_meta_device, + unsupported_meta_device, set_layer_config, ) from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block @@ -236,6 +236,9 @@ def __init__( self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) self.fp_layers = kwargs.pop("fp_layers", "") self.layer_config = layer_config + self.supported_types = SUPPORTED_LAYER_TYPES + self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES + self.scale_dtype = convert_dtype_str2torch(scale_dtype) if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") @@ -285,6 +288,28 @@ def __init__( if device_map is None: device_map = 0 + if isinstance(scheme, AutoScheme): + if self.mllm: + logger.info("AutoScheme with MLLM is not supported yet.") + sys.exit(1) + layer_config,_ = set_layer_config(self.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.fp_layers, + self.quant_lm_head, + enable_gguf_official_mixed=False, + is_mllm=self.mllm) + quant_layer_names = layer_config.keys() + fixed_layer_scheme = {k: v for k, v in layer_config.items() if v.get("fixed_by_user", False)} + # mainly using quant_layers and fixed by users + from auto_round.auto_schemes.gen_scheme import GenScheme + gen_scheme = GenScheme(scheme,self.model,quant_layer_names,fixed_layer_scheme, self.scale_dtype, self.dataset) + + # Set device, must place after model loading self._set_device(device_map) @@ -347,7 +372,6 @@ def __init__( if self.static_kv_dtype is not None: logger.warning("The static kv is experimental and currently has limited support.") - self.scale_dtype = convert_dtype_str2torch(scale_dtype) self._set_amp_dtype() self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device if self.act_bits <= 8 and self.amp_dtype == torch.float16: @@ -359,8 +383,6 @@ def __init__( logger.info(f"using {self.model.dtype} for quantization tuning") # Some helpers - self.supported_types = SUPPORTED_LAYER_TYPES - self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES if "hpu" in str(self.device): self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") self.batch_dim = None @@ -399,235 +421,6 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _set_layer_config( - self, - model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - default_scale_dtype: torch.dtype | str, - supported_types: tuple, - inner_supported_types: tuple, - quant_block_list=None, - fp_layers: str = "", - quant_lm_head: bool = False, - enable_gguf_official_mixed: bool = True, - is_mllm: bool = False, - ) -> tuple[dict, bool]: - """ - Normalize, validate, and expand layer-specific quantization configs. 
- Returns (final_layer_config, has_quant_layer_outside_block) - """ - - from auto_round.schemes import get_gguf_scheme - - # ---- helpers ------------------------------------------------- - def dispatch_layer_config(layer_config: dict[str, dict]) -> None: - """Assign scheme values as attributes to matched modules.""" - for layer_name, scheme in layer_config.items(): - module = get_module(model, layer_name) - for attr, value in scheme.items(): - setattr(module, attr, value) - - def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: - """Convert config entry into dict and validate keys.""" - if isinstance(item, str): - config = asdict(preset_name_to_scheme(item.upper())) - elif isinstance(item, QuantizationScheme): - config = asdict(item) - elif isinstance(item, dict): - invalid = set(item) - set(scheme_keys) - if invalid: - raise ValueError( - f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" - ) - config = dict(item) - else: - raise TypeError( - f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " - f"Expected str, dict, or QuantizationScheme." - ) - # Clean up - config = {k: v for k, v in config.items() if v is not None} - config["fixed_by_user"] = True - return config - - # ---- main logic ---------------------------------------------- - scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) - layer_config = copy.deepcopy(layer_config) or {} - - # 1. fp_layers -> force 16 - for name in get_fp_layer_names(self.model, fp_layers): - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - - # 2. normalize - layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} - - # 3. infer missing bits - for cfg in layer_config.values(): - if "data_type" in cfg and "bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: - cfg["bits"] = b - if "act_data_type" in cfg and "act_bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: - cfg["act_bits"] = b - - # 4. fill defaults - default_dict = asdict(default_scheme) - default_dict["scale_dtype"] = default_scale_dtype - for cfg in layer_config.values(): - for key in scheme_keys: - cfg.setdefault(key, default_dict.get(key)) - - # 5. collect supported modules - gguf_name = get_gguf_scheme(default_scheme) - if gguf_name and torch.nn.Embedding not in supported_types: - supported_types = (*supported_types, torch.nn.Embedding) - - all_layer_names, embedding_layer_names = [], [] - for n, m in model.named_modules(): - # cleanup stale attributes - for key in scheme_keys: - if hasattr(m, key): - delattr(m, key) - if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: - continue - all_layer_names.append(n) - if isinstance(m, torch.nn.Embedding): - embedding_layer_names.append(n) - - # 6. expand regex configs - for name in list(layer_config.keys()): - if name in all_layer_names: - continue - regex = re.compile(name) - matched = [ln for ln in all_layer_names if regex.search(ln)] - if not matched: - raise ValueError(f"Invalid regex '{name}' in layer_config, no match found.") - val = layer_config.pop(name) - for match in matched: - layer_config[match] = val - - # 7. 
lm_head - lm_head_name = get_lm_head_name(model) - tie_word_embeddings = False - if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): - tie_word_embeddings = model.config.tie_word_embeddings - - if quant_lm_head and tie_word_embeddings: - quant_lm_head = False - logger.warning( - "reset `quant_lm_head` to false as quantizing " - "lm_head with tied weights has not been supported currently" - ) - - if lm_head_name not in layer_config and quant_lm_head: - layer_config[lm_head_name] = default_dict.copy() - - # 8. enforce shape divisibility for int weight-only - if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: - for n, m in model.named_modules(): - if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: - if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: - layer_config.setdefault(n, default_dict.copy()) - layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) - logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") - - # 9. block layers: mark as in_blocks=True - for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): - cfg = layer_config.setdefault(name, default_dict.copy()) - cfg["in_blocks"] = True - - # ---- restore: ensure missing in_blocks are set to False and compute flag ---- - has_qlayer_outside_block = False - for cfg in layer_config.values(): - if "in_blocks" not in cfg: - cfg["in_blocks"] = False - # mark layer outside block - if not cfg["in_blocks"] and check_to_quantized(cfg): - has_qlayer_outside_block = True - - # 10. GGUF handling - if not gguf_name: - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block - - # embed + lm_head defaults for gguf - if lm_head_name not in layer_config and not tie_word_embeddings: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[lm_head_name] = cfg - has_qlayer_outside_block = True - for emd_name in embedding_layer_names: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[emd_name] = cfg - - if enable_gguf_official_mixed: - model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT - layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) - - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block - - def _parse_layer_config( - self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str - ) -> None: - """Parse and set the layer-wise quantization configuration.""" - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - if len(not_quantize_layer_names) > 0: - logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - - # Some other quantization configs - self.layer_config = copy.deepcopy(layer_config) if layer_config is not None else {} - scheme_keys = {f.name for f in fields(QuantizationScheme)} - - for key, item in self.layer_config.items(): - if isinstance(item, str): - config = asdict(preset_name_to_scheme(item.upper())) - elif isinstance(item, QuantizationScheme): - config = asdict(item) - elif 
isinstance(item, dict): - invalid_keys = set(item) - scheme_keys - if invalid_keys: - raise ValueError( - f"Invalid keys {invalid_keys} in layer_config for layer '{key}', " - f"only {scheme_keys} are supported" - ) - config = dict(item) - - # Drop None values - config = {k: v for k, v in config.items() if v is not None} - self.layer_config[key] = config - - if not self.quant_lm_head or (isinstance(self.scheme, str) and self.scheme.lower().startswith("gguf")): - return - for n, _ in self.model.named_modules(): - lm_head_layer_name = n - - if ( - hasattr(self.model, "config") - and self.model.config.tie_word_embeddings - and hasattr(self.model, "_tied_weights_keys") - ): - tied_keys = self.model._tied_weights_keys - for item in tied_keys: - if lm_head_layer_name in item: # TODO extend to encoder-decoder layer, seq classification model - self.quant_lm_head = False - logger.warning( - "reset `quant_lm_head` to `False` as quantizing lm_head with tied weights has not been " - "supported currently" - ) - break - - lm_head_layer_config = self.layer_config[lm_head_layer_name] if lm_head_layer_name in self.layer_config else {} - - for key in scheme_keys: - if key not in lm_head_layer_config: - lm_head_layer_config[key] = getattr(self, key) def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" @@ -1801,7 +1594,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.model = _handle_moe_model(self.model, formats=formats) # TODO check scale_dtype - self.layer_config, self.has_qlayer_outside_block = self._set_layer_config( + self.layer_config, self.has_qlayer_outside_block = set_layer_config( self.model, self.layer_config, self.scheme, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 38bed87e1..cf7d4d433 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -250,6 +250,6 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: @dataclass class AutoScheme: options: Optional[Iterable[QuantizationScheme | str]] - target_bits: float + avg_bits: float shared_layers: Optional[Iterable[Iterable[str]]] = None method: str = "default" diff --git a/auto_round/utils.py b/auto_round/utils.py index cac86e397..92e54a3ba 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -21,7 +21,7 @@ import re import sys from collections import UserDict -from dataclasses import fields +from dataclasses import fields, asdict from enum import Enum from functools import lru_cache from pathlib import Path @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme +from auto_round.schemes import QuantizationScheme, preset_name_to_scheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -2742,3 +2742,176 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return True return False + + + +def set_layer_config( + model: torch.nn.Module, + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + default_scale_dtype: torch.dtype | str, + supported_types: tuple, + inner_supported_types: tuple, + quant_block_list=None, + fp_layers: str = "", + quant_lm_head: bool = False, + enable_gguf_official_mixed: bool = True, + is_mllm: bool = False, +) -> tuple[dict, bool]: + """ + Normalize, validate, and expand layer-specific 
quantization configs. + Returns (final_layer_config, has_quant_layer_outside_block) + """ + + from auto_round.schemes import get_gguf_scheme + + # ---- helpers ------------------------------------------------- + def dispatch_layer_config(layer_config: dict[str, dict]) -> None: + """Assign scheme values as attributes to matched modules.""" + for layer_name, scheme in layer_config.items(): + module = get_module(model, layer_name) + for attr, value in scheme.items(): + setattr(module, attr, value) + + def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: + """Convert config entry into dict and validate keys.""" + if isinstance(item, str): + config = asdict(preset_name_to_scheme(item.upper())) + elif isinstance(item, QuantizationScheme): + config = asdict(item) + elif isinstance(item, dict): + invalid = set(item) - set(scheme_keys) + if invalid: + raise ValueError( + f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" + ) + config = dict(item) + else: + raise TypeError( + f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " + f"Expected str, dict, or QuantizationScheme." + ) + # Clean up + config = {k: v for k, v in config.items() if v is not None} + config["fixed_by_user"] = True + return config + + # ---- main logic ---------------------------------------------- + scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) + layer_config = copy.deepcopy(layer_config) or {} + + # 1. fp_layers -> force 16 + for name in get_fp_layer_names(model, fp_layers): + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float","fixed_by_user":True} + + # 2. normalize + layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} + + # 3. infer missing bits + for cfg in layer_config.values(): + if "data_type" in cfg and "bits" not in cfg: + if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: + cfg["bits"] = b + if "act_data_type" in cfg and "act_bits" not in cfg: + if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: + cfg["act_bits"] = b + + # 4. fill defaults + default_dict = asdict(default_scheme) + default_dict["scale_dtype"] = default_scale_dtype + for cfg in layer_config.values(): + for key in scheme_keys: + cfg.setdefault(key, default_dict.get(key)) + + # 5. collect supported modules + gguf_name = get_gguf_scheme(default_scheme) + if gguf_name and torch.nn.Embedding not in supported_types: + supported_types = (*supported_types, torch.nn.Embedding) + + all_layer_names, embedding_layer_names = [], [] + for n, m in model.named_modules(): + # cleanup stale attributes + for key in scheme_keys: + if hasattr(m, key): + delattr(m, key) + if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: + continue + all_layer_names.append(n) + if isinstance(m, torch.nn.Embedding): + embedding_layer_names.append(n) + + # 6. expand regex configs + for name in list(layer_config.keys()): + if name in all_layer_names: + continue + regex = re.compile(name) + matched = [ln for ln in all_layer_names if regex.search(ln)] + if not matched: + raise ValueError(f"Invalid '{name}' in layer_config, no match found.") + val = layer_config.pop(name) + for match in matched: + layer_config[match] = val + + # 7. 
lm_head + lm_head_name = get_lm_head_name(model) + tie_word_embeddings = False + if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): + tie_word_embeddings = model.config.tie_word_embeddings + + if quant_lm_head and tie_word_embeddings: + quant_lm_head = False + logger.warning( + "reset `quant_lm_head` to false as quantizing " + "lm_head with tied weights has not been supported currently" + ) + + if lm_head_name not in layer_config and quant_lm_head: + layer_config[lm_head_name] = default_dict.copy() + + # 8. enforce shape divisibility for int weight-only + if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: + layer_config.setdefault(n, default_dict.copy()) + layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) + logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") + + # 9. block layers: mark as in_blocks=True + for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): + if name not in layer_config: + layer_config[name] = default_dict.copy() + layer_config[name]["fixed_by_user"]=False + layer_config[name]["in_blocks"] = True + + # ---- restore: ensure missing in_blocks are set to False and compute flag ---- + has_qlayer_outside_block = False + for cfg in layer_config.values(): + if "in_blocks" not in cfg: + cfg["in_blocks"] = False + # mark layer outside block + if not cfg["in_blocks"] and check_to_quantized(cfg): + has_qlayer_outside_block = True + + # 10. GGUF handling + if not gguf_name: + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block + + # embed + lm_head defaults for gguf + if lm_head_name not in layer_config and not tie_word_embeddings: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[lm_head_name] = cfg + has_qlayer_outside_block = True + for emd_name in embedding_layer_names: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[emd_name] = cfg + + if enable_gguf_official_mixed: + model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT + layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) + + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 6376e92c2..b4f5e6041 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -5,15 +5,8 @@ import unittest sys.path.insert(0, "../..") -import torch -import transformers -from lm_eval.utils import make_table # pylint: disable=E0401 -from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig, AutoScheme -from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 - class TestAutoScheme(unittest.TestCase): @classmethod @@ -28,6 +21,6 @@ def tearDownClass(self): def test_auto_scheme(self): model_name = "facebook/opt-125m" - scheme = AutoScheme(target_bits=3, options=("W2A16", 
"W4A16", "BF16")) - ar = AutoRound(model_name=model_name, scheme=scheme) + scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) + ar = AutoRound(model=model_name, scheme=scheme, iters=1, nsamples=1) ar.quantize_and_save(self.save_dir) From 35a298b0f30c57df5e5af1808d3330538371c237 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 08:37:38 +0000 Subject: [PATCH 29/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/auto_schemes/gen_scheme.py | 35 ++++++++++++++++++--------- auto_round/compressors/base.py | 32 +++++++++++++----------- auto_round/utils.py | 16 +++++++----- test/test_cuda/test_auto_scheme.py | 1 + 4 files changed, 53 insertions(+), 31 deletions(-) diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py index badf39742..ba6b0a679 100644 --- a/auto_round/auto_schemes/gen_scheme.py +++ b/auto_round/auto_schemes/gen_scheme.py @@ -1,4 +1,18 @@ -from typing import Union, Iterable +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterable, Union import torch @@ -6,14 +20,13 @@ class GenScheme: - def __init__(self, - auto_scheme: AutoScheme, - model: torch.nn.Module, - quant_layer_names: Iterable[str], - fixed_layer_scheme:dict[str, dict], - scale_dtype: str = "fp16", - dataset="pile-10k" - ): + def __init__( + self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme: dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k", + ): pass - - diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 495cf3d04..66c79274f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -90,10 +90,11 @@ mv_module_from_gpu, reset_params, set_amax_for_all_moe_layers, + set_layer_config, set_module, to_device, to_dtype, - unsupported_meta_device, set_layer_config, + unsupported_meta_device, ) from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block @@ -292,23 +293,27 @@ def __init__( if self.mllm: logger.info("AutoScheme with MLLM is not supported yet.") sys.exit(1) - layer_config,_ = set_layer_config(self.model, - self.layer_config, - self.scheme, - self.scale_dtype, - self.supported_types, - self.inner_supported_types, - self.quant_block_list, - self.fp_layers, - self.quant_lm_head, - enable_gguf_official_mixed=False, - is_mllm=self.mllm) + layer_config, _ = set_layer_config( + self.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.fp_layers, + self.quant_lm_head, + enable_gguf_official_mixed=False, + is_mllm=self.mllm, + ) quant_layer_names = layer_config.keys() fixed_layer_scheme = {k: v for k, v in layer_config.items() if v.get("fixed_by_user", False)} # mainly using 
quant_layers and fixed by users from auto_round.auto_schemes.gen_scheme import GenScheme - gen_scheme = GenScheme(scheme,self.model,quant_layer_names,fixed_layer_scheme, self.scale_dtype, self.dataset) + gen_scheme = GenScheme( + scheme, self.model, quant_layer_names, fixed_layer_scheme, self.scale_dtype, self.dataset + ) # Set device, must place after model loading self._set_device(device_map) @@ -421,7 +426,6 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" if isinstance(scheme, QuantizationScheme): diff --git a/auto_round/utils.py b/auto_round/utils.py index 92e54a3ba..e865726b8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -21,7 +21,7 @@ import re import sys from collections import UserDict -from dataclasses import fields, asdict +from dataclasses import asdict, fields from enum import Enum from functools import lru_cache from pathlib import Path @@ -2744,7 +2744,6 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return False - def set_layer_config( model: torch.nn.Module, layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], @@ -2802,7 +2801,13 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 1. fp_layers -> force 16 for name in get_fp_layer_names(model, fp_layers): - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float","fixed_by_user":True} + layer_config[name] = { + "bits": 16, + "act_bits": 16, + "data_type": "float", + "act_data_type": "float", + "fixed_by_user": True, + } # 2. 
normalize layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} @@ -2861,8 +2866,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if quant_lm_head and tie_word_embeddings: quant_lm_head = False logger.warning( - "reset `quant_lm_head` to false as quantizing " - "lm_head with tied weights has not been supported currently" + "reset `quant_lm_head` to false as quantizing " "lm_head with tied weights has not been supported currently" ) if lm_head_name not in layer_config and quant_lm_head: @@ -2881,7 +2885,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): if name not in layer_config: layer_config[name] = default_dict.copy() - layer_config[name]["fixed_by_user"]=False + layer_config[name]["fixed_by_user"] = False layer_config[name]["in_blocks"] = True # ---- restore: ensure missing in_blocks are set to False and compute flag ---- diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b4f5e6041..b9fffdee9 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -8,6 +8,7 @@ from auto_round import AutoRound, AutoRoundConfig, AutoScheme + class TestAutoScheme(unittest.TestCase): @classmethod def setUpClass(self): From 4a594cd5e778b2c5897e3131d72e21c7e46ba74f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 20:37:42 +0800 Subject: [PATCH 30/35] fix --- auto_round/compressors/base.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 66c79274f..955971306 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -725,20 +725,20 @@ def _check_compatibility(self) -> None: " We are likely to release new algorithm for certain configurations in the future." 
) - # Check group_size 32 for auto_round - if ( - self.data_type == "int" - and hasattr(self, "formats") - and any(key in fmt for fmt in self.formats for key in ("auto_round", "auto_gptq", "auto_awq")) - ): - for n, m in self.model.named_modules(): - if type(m) in self.supported_types: - if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - self.layer_config[n] = {"bits": 16} - logger.info( - f"{n} will not be quantized due to its shape not being divisible by 32," - " resulting in an exporting issue to autogptq" - ) + # # Check group_size 32 for auto_round + # if ( + # self.data_type == "int" + # and hasattr(self, "formats") + # and any(key in fmt for fmt in self.formats for key in ("auto_round", "auto_gptq", "auto_awq")) + # ): + # for n, m in self.model.named_modules(): + # if type(m) in self.supported_types: + # if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + # self.layer_config[n] = {"bits": 16} + # logger.info( + # f"{n} will not be quantized due to its shape not being divisible by 32," + # " resulting in an exporting issue to autogptq" + # ) if ( self.seqlen is not None From dcd08d629cc3840efdc24b8c9af97af2edf71095 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 30 Sep 2025 14:02:23 +0800 Subject: [PATCH 31/35] fix uts, still one left --- .../export/export_to_autoround/export_to_nvfp_mxfp.py | 4 ++-- auto_round/schemes.py | 9 +++++++-- auto_round/utils.py | 5 +++++ test/test_cpu/test_autoround.py | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index c4a02f673..240a94899 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -174,7 +174,7 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs): for n, m in model.named_modules(): if type(m) in SUPPORTED_LAYER_TYPES: layer = m - if layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): + if hasattr(layer,"act_bits") and layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): assert hasattr(layer, "act_max") from auto_round.data_type.nvfp import calculate_gparam @@ -198,7 +198,7 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs): for layer_name in layer_config: if ( not layer_config[layer_name]["in_blocks"] and layer_config[layer_name]["bits"] <= 8 - ): ##lm head ##TODO fix act and so on + ): ##lm head # TODO fix act and so on extra_config[layer_name] = {} extra_config[layer_name]["bits"] = layer_config[layer_name]["bits"] extra_config[layer_name]["data_type"] = layer_config[layer_name]["data_type"] diff --git a/auto_round/schemes.py b/auto_round/schemes.py index cf7d4d433..32be2fb52 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -238,11 +238,16 @@ def is_preset_scheme(name: str) -> bool: def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): - return True + return scheme for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue - if val == scheme: + equal = True + for scheme_key in val.keys(): + if val[scheme_key] is not None and val[scheme_key] != scheme.get(scheme_key, None): + equal = False + break + if equal: return key return "" diff --git a/auto_round/utils.py b/auto_round/utils.py index e865726b8..009d516d8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2834,7 +2834,9 @@ 
def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str supported_types = (*supported_types, torch.nn.Embedding) all_layer_names, embedding_layer_names = [], [] + all_module_names = [] for n, m in model.named_modules(): + all_module_names.append(n) # cleanup stale attributes for key in scheme_keys: if hasattr(m, key): @@ -2849,6 +2851,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str for name in list(layer_config.keys()): if name in all_layer_names: continue + if name in all_module_names: + logger.warning_once(f"the type of `{name}` is not supported in your scheme, ignore it for now.") + continue regex = re.compile(name) matched = [ln for ln in all_layer_names if regex.search(ln)] if not matched: diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 9511f0cf8..aac524800 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -720,6 +720,7 @@ def test_invalid_layer_config(self): iters=1, layer_config=layer_config, ) + ar.quantize() def test_quant_lm_head(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" From 91722646068c4adcdbfefe058e3f486b58793a6f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 30 Sep 2025 15:19:25 +0800 Subject: [PATCH 32/35] fix gguf issue --- auto_round/compressors/base.py | 6 ++++-- auto_round/schemes.py | 2 ++ auto_round/utils.py | 37 ++++++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 955971306..0731d0ba8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -428,11 +428,13 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" + res= "" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) elif isinstance(scheme, dict): scheme = scheme elif isinstance(scheme, str): + res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different scheme = scheme.upper() scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] @@ -481,7 +483,7 @@ def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kw break for key in scheme_keys: scheme[key] = getattr(self, key) - return QuantizationScheme.from_dict(scheme) + return res if res else QuantizationScheme.from_dict(scheme) def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" @@ -804,7 +806,7 @@ def remove_duplicates(lst): for f in formats: if f.startswith("gguf"): - self.scheme = preset_name_to_scheme(f) + self.scheme = f.upper() break for format_ in formats: diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 32be2fb52..cde37a0c9 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -239,6 +239,8 @@ def is_preset_scheme(name: str) -> bool: def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return scheme + if isinstance(scheme, str): + return "" for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue diff --git a/auto_round/utils.py b/auto_round/utils.py index 009d516d8..84390fa43 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ 
-35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme +from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, get_gguf_scheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -1940,6 +1940,30 @@ def _set_config(config, target_config): ) new_type = new_type[:bits_index] + target_bits + new_type[bits_index + 1 :] else: + config_tmp = config.copy() + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in config.keys(): + if key not in scheme_keys: + config_tmp.pop(key, None) + matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched + if not matched_scheme: + if config.get("super_group_size", None) is not None: + new_type = new_type[:bits_index] + str(config["bits"]) + "_k" + if config.get("super_group_size", None) is None or new_type not in GGUF_INNER_CONFIG: + if config.get("sym", True): + new_type = new_type[:bits_index] + str(config["bits"]) + "_0" + if new_type not in GGUF_INNER_CONFIG: + new_type = new_type[:bits_index] + str(config["bits"]) + "_1" + if not config.get("sym", True): + new_type = new_type[:bits_index] + str(config["bits"]) + "_1" + if new_type not in GGUF_INNER_CONFIG: + new_type = new_type[:bits_index] + str(config["bits"]) + "_0" + if new_type not in GGUF_INNER_CONFIG: + raise ValueError(f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, please have a check.") + else: + logger.warning_once(f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, reset to {new_type}") new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :] new_type = _search_gguf_type(new_type) if new_type is None: @@ -2747,7 +2771,7 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): def set_layer_config( model: torch.nn.Module, layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", + default_scheme: Union[str, "QuantizationScheme"], default_scale_dtype: torch.dtype | str, supported_types: tuple, inner_supported_types: tuple, @@ -2822,11 +2846,14 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str cfg["act_bits"] = b # 4. fill defaults - default_dict = asdict(default_scheme) + if isinstance(default_scheme,str): + default_dict = asdict(preset_name_to_scheme(default_scheme.upper())) + else: + default_dict = asdict(default_scheme) default_dict["scale_dtype"] = default_scale_dtype for cfg in layer_config.values(): for key in scheme_keys: - cfg.setdefault(key, default_dict.get(key)) + cfg.setdefault(key, default_dict.copy().get(key)) # 5. 
collect supported modules gguf_name = get_gguf_scheme(default_scheme) @@ -2914,6 +2941,8 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str layer_config[lm_head_name] = cfg has_qlayer_outside_block = True for emd_name in embedding_layer_names: + if emd_name in layer_config: + continue cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} layer_config[emd_name] = cfg From f98092c6e6b4e53e9d4653ec21fe4e7fa69a0ee3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 07:25:15 +0000 Subject: [PATCH 33/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 5 ++--- .../export_to_autoround/export_to_nvfp_mxfp.py | 2 +- auto_round/utils.py | 18 +++++++++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f957b2a5c..f1036ecac 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -328,7 +328,6 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - # Tuning hyperparameters self.seed = seed set_seed(self.seed) @@ -416,13 +415,13 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" - res= "" + res = "" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) elif isinstance(scheme, dict): scheme = scheme elif isinstance(scheme, str): - res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different + res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different scheme = scheme.upper() scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index 240a94899..eaf3ad9ae 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -174,7 +174,7 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs): for n, m in model.named_modules(): if type(m) in SUPPORTED_LAYER_TYPES: layer = m - if hasattr(layer,"act_bits") and layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): + if hasattr(layer, "act_bits") and layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): assert hasattr(layer, "act_max") from auto_round.data_type.nvfp import calculate_gparam diff --git a/auto_round/utils.py b/auto_round/utils.py index 187fc883d..a1c411373 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, get_gguf_scheme +from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -1949,7 +1949,7 @@ def _set_config(config, target_config): for key in config.keys(): if key not in 
scheme_keys: config_tmp.pop(key, None) - matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched + matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched if not matched_scheme: if config.get("super_group_size", None) is not None: new_type = new_type[:bits_index] + str(config["bits"]) + "_k" @@ -1963,11 +1963,15 @@ def _set_config(config, target_config): if new_type not in GGUF_INNER_CONFIG: new_type = new_type[:bits_index] + str(config["bits"]) + "_0" if new_type not in GGUF_INNER_CONFIG: - raise ValueError(f"the setting in layer_config {layer_name} " - f"could not match any supported gguf format, please have a check.") + raise ValueError( + f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, please have a check." + ) else: - logger.warning_once(f"the setting in layer_config {layer_name} " - f"could not match any supported gguf format, reset to {new_type}") + logger.warning_once( + f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, reset to {new_type}" + ) new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :] new_type = _search_gguf_type(new_type) if new_type is None: @@ -2850,7 +2854,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str cfg["act_bits"] = b # 4. fill defaults - if isinstance(default_scheme,str): + if isinstance(default_scheme, str): default_dict = asdict(preset_name_to_scheme(default_scheme.upper())) else: default_dict = asdict(default_scheme) From 033d1f6ed3e4b4a128a07c964af01b156c820704 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 30 Sep 2025 16:48:32 +0800 Subject: [PATCH 34/35] update a little --- auto_round/auto_schemes/gen_scheme.py | 66 ++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py index ba6b0a679..03c253a6a 100644 --- a/auto_round/auto_schemes/gen_scheme.py +++ b/auto_round/auto_schemes/gen_scheme.py @@ -17,16 +17,68 @@ import torch from auto_round import AutoScheme +from auto_round.utils import get_layer_features class GenScheme: def __init__( - self, - auto_scheme: AutoScheme, - model: torch.nn.Module, - quant_layer_names: Iterable[str], - fixed_layer_scheme: dict[str, dict], - scale_dtype: str = "fp16", - dataset="pile-10k", + self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme: dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k", ): + self.auto_scheme = auto_scheme + self.model = model + self.quant_layer_names = quant_layer_names + self.fixed_layer_scheme = fixed_layer_scheme + self.scale_dtype = scale_dtype + self.dataset = dataset + + def _get_min_max_avg_bits(self) -> tuple[float, float]: pass + + # not validate yet + def get_layer_bits(self, layer): + weight = layer.weight + n_param = weight.numel() + weight_bits = getattr(layer, 'bits', 16) + group_size = getattr(layer, 'group_size', 128) + super_group_size = getattr(layer, 'super_group_size', None) + super_weight_bits = getattr(layer, 'super_bits', None) + + # Main quantization cost + weight_total_bits = weight_bits * n_param + if weight_bits>=16: # Unquantized layer + return weight_total_bits, 16 + + in_features, output_features = get_layer_features(layer) + # Determine number of groups + if group_size > 0: # group-wise + n_group = output_features * (in_features + group_size - 1) // 
group_size + elif group_size == 0: # per-tensor + n_group = 1 + elif group_size == -1: # per-channel + n_group = output_features # out_channels + else: + raise ValueError(f"Invalid group_size {group_size}") + aux_total_bits = 0 + if not super_group_size: + # Scale and zero point bitwidths + scale_bits = 16 + zp_bits = weight_bits if not super_group_size else 32 # default: same as weight_bits + # Overhead from scales and zero points + aux_total_bits = n_group * (scale_bits + zp_bits) + + # Double quantization case + if super_group_size: + # Number of super-groups + aux_total_bits+=n_group*super_weight_bits * 2 #scale and min int count + n_super_group = (n_group + super_group_size - 1) // super_group_size + aux_total_bits += n_super_group * 32 * 2 # double quant scale and min_v + + total_bits = weight_total_bits + aux_total_bits + avg_bits = total_bits / n_param + return total_bits, avg_bits From 8ae1dfa56220727b39c2c29eacb22a98ab998e10 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 08:50:03 +0000 Subject: [PATCH 35/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/auto_schemes/gen_scheme.py | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py index 03c253a6a..e009e12de 100644 --- a/auto_round/auto_schemes/gen_scheme.py +++ b/auto_round/auto_schemes/gen_scheme.py @@ -22,13 +22,13 @@ class GenScheme: def __init__( - self, - auto_scheme: AutoScheme, - model: torch.nn.Module, - quant_layer_names: Iterable[str], - fixed_layer_scheme: dict[str, dict], - scale_dtype: str = "fp16", - dataset="pile-10k", + self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme: dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k", ): self.auto_scheme = auto_scheme self.model = model @@ -44,14 +44,14 @@ def _get_min_max_avg_bits(self) -> tuple[float, float]: def get_layer_bits(self, layer): weight = layer.weight n_param = weight.numel() - weight_bits = getattr(layer, 'bits', 16) - group_size = getattr(layer, 'group_size', 128) - super_group_size = getattr(layer, 'super_group_size', None) - super_weight_bits = getattr(layer, 'super_bits', None) + weight_bits = getattr(layer, "bits", 16) + group_size = getattr(layer, "group_size", 128) + super_group_size = getattr(layer, "super_group_size", None) + super_weight_bits = getattr(layer, "super_bits", None) # Main quantization cost weight_total_bits = weight_bits * n_param - if weight_bits>=16: # Unquantized layer + if weight_bits >= 16: # Unquantized layer return weight_total_bits, 16 in_features, output_features = get_layer_features(layer) @@ -75,9 +75,9 @@ def get_layer_bits(self, layer): # Double quantization case if super_group_size: # Number of super-groups - aux_total_bits+=n_group*super_weight_bits * 2 #scale and min int count + aux_total_bits += n_group * super_weight_bits * 2 # scale and min int count n_super_group = (n_group + super_group_size - 1) // super_group_size - aux_total_bits += n_super_group * 32 * 2 # double quant scale and min_v + aux_total_bits += n_super_group * 32 * 2 # double quant scale and min_v total_bits = weight_total_bits + aux_total_bits avg_bits = total_bits / n_param
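
To make the bookkeeping above easier to check, here is a minimal, self-contained sketch of the same average-bit accounting that get_layer_bits performs. The helper name estimate_avg_bits and the 4096x4096 example shape are illustrative only; the group and super-group conventions (fp16 scales, 32-bit super-group scale and min) follow the patch.

def estimate_avg_bits(in_features, out_features, bits=4, group_size=32,
                      super_group_size=8, super_bits=6, scale_bits=16):
    n_param = in_features * out_features
    weight_total_bits = bits * n_param
    if bits >= 16:  # unquantized layer
        return weight_total_bits, 16.0
    # group-wise quantization: one group per group_size input elements per output channel
    n_group = out_features * ((in_features + group_size - 1) // group_size)
    aux_total_bits = 0
    if not super_group_size:
        # one fp16 scale and one integer zero point per group
        aux_total_bits = n_group * (scale_bits + bits)
    else:
        # double quantization: quantized scale and min per group,
        # plus a 32-bit scale and min per super-group
        aux_total_bits += n_group * super_bits * 2
        n_super_group = (n_group + super_group_size - 1) // super_group_size
        aux_total_bits += n_super_group * 32 * 2
    total_bits = weight_total_bits + aux_total_bits
    return total_bits, total_bits / n_param

# A 4096x4096 layer at 4 bits, group size 32, with double quantization
# works out to 4.625 bits per weight.
print(estimate_avg_bits(4096, 4096))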
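For orientation, the caller-facing flow at the end of this series matches the CUDA test added above. Treat the snippet below as a sketch of the in-progress API rather than a stable contract; AutoScheme handling inside the compressor is still being wired up in these patches.

from auto_round import AutoRound, AutoScheme

# Mix W2A16, W4A16 and BF16 layers so the model averages roughly 3 bits per weight.
# When the compressor sees an AutoScheme, it calls set_layer_config() and hands the
# quantizable and user-fixed layers to the GenScheme helper.
scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16"))
ar = AutoRound(model="facebook/opt-125m", scheme=scheme, iters=1, nsamples=1)
ar.quantize_and_save("./saved")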
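The reworked set_layer_config() in auto_round/utils.py also expands layer_config keys that are not exact module names as regular expressions over the model's quantizable layers, and validates dict values against the QuantizationScheme fields (plus scale_dtype). The override below is a hypothetical illustration; the OPT-style layer names and the chosen bit widths are assumptions, not taken from the patch.

# Hypothetical per-layer overrides passed via AutoRound(..., layer_config=layer_config).
layer_config = {
    "lm_head": "W4A16",                     # exact layer name mapped to a preset scheme
    r"model\.decoder\.layers\.\d+\.fc1": {  # regex key, expanded to every matching layer
        "bits": 2,
        "group_size": 64,
    },
}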