From 6ffcf60c1c32c28f71fb45fd6eaa7fb50657d076 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 14:14:42 +0800 Subject: [PATCH 01/35] try to enable auto_scheme API --- auto_round/__init__.py | 3 +- auto_round/__main__.py | 9 ++++ auto_round/auto_schemes/__init__.py | 24 +++++++++ auto_round/autoround.py | 5 +- auto_round/compressors/base.py | 76 +++++++++++++++-------------- auto_round/data_type/register.py | 3 +- auto_round/schemes.py | 29 +++++++++-- 7 files changed, 103 insertions(+), 46 deletions(-) create mode 100644 auto_round/auto_schemes/__init__.py diff --git a/auto_round/__init__.py b/auto_round/__init__.py index 15bbc373d..1ce7d5e1e 100644 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -13,11 +13,10 @@ # limitations under the License. from auto_round.autoround import AutoRound -# support for old api from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam +from auto_round.schemes import QuantizationScheme, AutoScheme from auto_round.utils import LazyImport - def __getattr__(name): if name == "AutoHfQuantizer": from auto_round.inference.auto_quantizer import AutoHfQuantizer diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 78a8fc9d6..5c77c2f10 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -466,6 +466,13 @@ def tune(args): extra_config.tuning_config = tuning_config extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config + layer_config = {} + from auto_round.auto_schemes.delta_loss import get_mixed_config_layer_config + best_path = get_mixed_config_layer_config(model_name,target_bits=6) + for item in best_path: + layer_config[item[0]] = {} + layer_config[item[0]]["bits"] = item[1] + layer_config[item[0]]["act_bits"] = item[1] autoround: BaseCompressor = AutoRound( model=model_name, @@ -484,6 +491,8 @@ def tune(args): not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, extra_config=extra_config, + layer_config=layer_config + ) model_name = args.model.rstrip("/") diff --git a/auto_round/auto_schemes/__init__.py b/auto_round/auto_schemes/__init__.py new file mode 100644 index 000000000..38d40e023 --- /dev/null +++ b/auto_round/auto_schemes/__init__.py @@ -0,0 +1,24 @@ +AUTO_SCHEMES_ALGS = {} + +def register_dtype(names): + """Class decorator to register a mixed precision algorithm to the registry. + + Decorator function used before a Pattern subclass. + + Args: + names: A string. Define the export type. + + Returns: + cls: The class of register. 
+ """ + + def register(alg): + if isinstance(names, (tuple, list)): + for name in names: + AUTO_SCHEMES_ALGS[name] = alg + else: + AUTO_SCHEMES_ALGS[names] = alg + + return alg + + return register diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 4074213a9..22d3dc29b 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -25,7 +25,7 @@ MLLMCompressor, ) from auto_round.logger import deprecated, logger -from auto_round.schemes import QuantizationScheme +from auto_round.schemes import QuantizationScheme, AutoScheme from auto_round.utils import is_mllm_model @@ -64,6 +64,7 @@ def __new__( model: Union[torch.nn.Module, str], tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", + auto_scheme: AutoScheme = None, layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -77,7 +78,6 @@ def __new__( seed: int = 42, # for adam enable_adam: bool = False, - # for MLLM extra_config: ExtraConfig = None, **kwargs, ) -> BaseCompressor: @@ -159,6 +159,7 @@ def __new__( model=model, tokenizer=tokenizer, scheme=scheme, + auto_scheme=auto_scheme, layer_config=layer_config, dataset=dataset, iters=iters, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 01546034d..fa7ee0bf8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme +from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, AutoScheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -130,6 +130,7 @@ def __init__( model: Union[torch.nn.Module, str], tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", + auto_scheme: AutoScheme = None, layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -204,7 +205,6 @@ def __init__( """ self.scheme = None self._parse_and_set_scheme(scheme, kwargs) - # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options amp = kwargs.pop("amp", True) @@ -237,7 +237,7 @@ def __init__( logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" - # deprecated, default not to use torch.use_deterministic_algorithms + # Deprecated, default not to use torch.use_deterministic_algorithms if not disable_deterministic_algorithms or enable_deterministic_algorithms: if not disable_deterministic_algorithms: logger.warning( @@ -255,26 +255,14 @@ def __init__( if device_map is None: device_map = 0 - # Set device, must place after model loading - self._set_device(device_map) - - if (isinstance(device_map, dict) and device_map) or device_map == "auto": - self.device_map = device_map - elif isinstance(device_map, str) and "," in device_map: - device_map = device_map.replace(" ", "") # Remove any spaces - self.device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] - self.device_map = "auto" - else: - self.device_map = None - self._set_device_map_in_blocks(self.device_map) # Model related self.quantized = False if isinstance(model, str): model, tokenizer, low_cpu_mem_usage = llm_load_model( model, - device="cpu", - low_cpu_mem_mode=low_cpu_mem_usage, # always load cpu first + device="cpu", # always load cpu first + low_cpu_mem_mode=low_cpu_mem_usage, ) elif tokenizer is None and iters > 0: raise ValueError("A tokenizer must be set for non-str model input") @@ -289,17 +277,23 @@ def __init__( self.tokenizer = tokenizer self.shared_cache_keys = get_shared_keys(self.model) - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - if len(not_quantize_layer_names) > 0: - logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - self._parse_layer_config(layer_config) # must place after model init + self._parse_layer_config(layer_config, fp_layers) # must place after model init self.to_quant_block_names = to_quant_block_names + # Set device, must place after model loading + self._set_device(device_map) + + if (isinstance(device_map, dict) and device_map) or device_map == "auto": + self.device_map = device_map + elif isinstance(device_map, str) and "," in device_map: + device_map = device_map.replace(" ", "") # Remove any spaces + self.device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] + self.device_map = "auto" + else: + self.device_map = None + self._set_device_map_in_blocks(self.device_map) + # Tuning hyperparameters self.seed = seed set_seed(self.seed) @@ -385,7 +379,7 @@ def __init__( import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] - def _set_device(self, device_map): + def _set_device(self, device_map:Union[str, torch.device, int,dict])->None: if hasattr(self, "device") and self.device is not None: return if isinstance(device_map, (str, torch.device, int)): @@ -409,8 +403,16 @@ def _set_device(self, device_map): else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> None: + def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers) -> None: """Parse and set the layer-wise quantization configuration.""" + not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) + if 
len(not_quantize_layer_names) > 0: + logger.info(f"{not_quantize_layer_names} will not be quantized.") + if layer_config is None: + layer_config = {} + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} + # Some other quantization configs self.layer_config = {} if layer_config is None else layer_config scheme_keys = [f.name for f in fields(QuantizationScheme)] @@ -1709,7 +1711,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - self.has_qlayer_outside_block = self._set_layerwise_config(self.layer_config) + self.has_qlayer_outside_block = self._set_layerwise_config(model, self.layer_config) if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: @@ -1935,7 +1937,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: del layer_input clear_memory(q_layer_input) - def _set_layerwise_config(self, layer_config: dict) -> bool: + def _set_layerwise_config(self, model:torch.nn.Module, layer_config: dict) -> bool: """ Sets the layer-wise configuration based on the provided `layer_config`. By default, only quantize layers in blocks. @@ -1950,14 +1952,14 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: # Get the names of layers in quantization blocks supported_types = self.supported_types layers_in_blocks = get_layer_names_in_block( - self.model, supported_types, self.quant_block_list, self.inner_supported_types + model, supported_types, self.quant_block_list, self.inner_supported_types ) - ##process regex in layer_config + # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys keys = get_quant_keys() - for n, m in self.model.named_modules(): + for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning for key in keys: if hasattr(m, key): @@ -1981,7 +1983,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: for match_name in matched_names: layer_config[match_name] = val else: - tmp_m = get_module(self.model, name) + tmp_m = get_module(model, name) if not isinstance(tmp_m, torch.nn.Embedding): # TODO not good code style raise ValueError(f"key {name} in layer_config is invalid, please have a double check") @@ -1989,17 +1991,17 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: # Iterate through all modules in the model is_gguf = hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) - for n, m in self.model.named_modules(): + for n, m in model.named_modules(): # Skip unsupported types if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: - if n in self.layer_config: + if n in layer_config: if not isinstance(m, torch.nn.Embedding): logger.warning(f"{n} is not supported, layer_config {n}: {layer_config[n]} will be ignored.") - self.layer_config.pop(n) + layer_config.pop(n) continue if not is_gguf: if not check_to_quantized(layer_config[n]): - self.layer_config.pop(n) + layer_config.pop(n) continue else: continue diff --git a/auto_round/data_type/register.py b/auto_round/data_type/register.py index 12c4406a4..fca259ed6 100644 --- a/auto_round/data_type/register.py +++ b/auto_round/data_type/register.py @@ -22,8 
+22,7 @@ def register_dtype(names): Decorator function used before a Pattern subclass. Args: - cls (class): The subclass of register. - name: A string. Define the export type. + names: A string. Define the export type. Returns: cls: The class of register. diff --git a/auto_round/schemes.py b/auto_round/schemes.py index a5c5975c9..7b6cf2f4d 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,9 +14,9 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import Generator, List, Optional +from typing import Optional, Iterable -__all__ = ["QuantizationScheme", "preset_name_to_scheme"] +__all__ = ["QuantizationScheme", "preset_name_to_scheme", "AutoScheme"] @dataclass @@ -38,7 +38,7 @@ def from_dict(cls, config: dict): return cls(**config) @classmethod - def get_attributes(cls: "QuantizationScheme") -> List[str]: + def get_attributes(cls: "QuantizationScheme") -> list[str]: return [field.name for field in fields(cls)] def __getitem__(self, key: str): @@ -180,6 +180,8 @@ def is_preset_scheme(name: str) -> bool: } ) + + # FP8 = asdict(QuantArgs.from_dict({ # "bits": 8, # "group_size": 128, @@ -201,6 +203,18 @@ def is_preset_scheme(name: str) -> bool: } ) +# For AutoScheme 16 bits options +BF16 = QuantizationScheme.from_dict( + { + "bits": 16, + "group_size": 0, + "data_type": "fp", + "act_bits": 16, + "act_data_type": "fp", + } +) + + PRESET_SCHEMES = { "W4A16": W4A16, "W2A16": W2A16, @@ -211,6 +225,7 @@ def is_preset_scheme(name: str) -> bool: "NVFP4": NVFP4, "FPW8A16": FPW8A16, "FP8_STATIC": FP8_STATIC, + "BF16": BF16, } from auto_round.export.export_to_gguf.config import GGUF_CONFIG @@ -220,3 +235,11 @@ def is_preset_scheme(name: str) -> bool: value.pop("embedding", None) value.pop("lm_head", None) PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) + + +@dataclass +class AutoScheme: + options:Optional[Iterable[QuantizationScheme]] + target_bits:float + shared_layers:Optional[Iterable[Iterable[str]]]=None + method:str="naive_pre" \ No newline at end of file From 5d80825baa9643790b1ada4061d32818ec82bb04 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 06:15:51 +0000 Subject: [PATCH 02/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/__init__.py | 1 + auto_round/__main__.py | 6 +++--- auto_round/auto_schemes/__init__.py | 15 +++++++++++++++ auto_round/autoround.py | 2 +- auto_round/compressors/base.py | 9 ++++----- auto_round/schemes.py | 11 +++++------ 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/auto_round/__init__.py b/auto_round/__init__.py index 1ce7d5e1e..d7be4984c 100644 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -17,6 +17,7 @@ from auto_round.schemes import QuantizationScheme, AutoScheme from auto_round.utils import LazyImport + def __getattr__(name): if name == "AutoHfQuantizer": from auto_round.inference.auto_quantizer import AutoHfQuantizer diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 5c77c2f10..a69359db8 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -468,7 +468,8 @@ def tune(args): extra_config.mllm_config = mllm_config layer_config = {} from auto_round.auto_schemes.delta_loss import get_mixed_config_layer_config - best_path = get_mixed_config_layer_config(model_name,target_bits=6) + + best_path = get_mixed_config_layer_config(model_name, target_bits=6) for item in best_path: 
layer_config[item[0]] = {} layer_config[item[0]]["bits"] = item[1] @@ -491,8 +492,7 @@ def tune(args): not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, extra_config=extra_config, - layer_config=layer_config - + layer_config=layer_config, ) model_name = args.model.rstrip("/") diff --git a/auto_round/auto_schemes/__init__.py b/auto_round/auto_schemes/__init__.py index 38d40e023..d3b055be2 100644 --- a/auto_round/auto_schemes/__init__.py +++ b/auto_round/auto_schemes/__init__.py @@ -1,5 +1,20 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + AUTO_SCHEMES_ALGS = {} + def register_dtype(names): """Class decorator to register a mixed precision algorithm to the registry. diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 22d3dc29b..ae1a37677 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -25,7 +25,7 @@ MLLMCompressor, ) from auto_round.logger import deprecated, logger -from auto_round.schemes import QuantizationScheme, AutoScheme +from auto_round.schemes import AutoScheme, QuantizationScheme from auto_round.utils import is_mllm_model diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index fa7ee0bf8..72ca17ddc 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, AutoScheme +from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -255,13 +255,12 @@ def __init__( if device_map is None: device_map = 0 - # Model related self.quantized = False if isinstance(model, str): model, tokenizer, low_cpu_mem_usage = llm_load_model( model, - device="cpu", # always load cpu first + device="cpu", # always load cpu first low_cpu_mem_mode=low_cpu_mem_usage, ) elif tokenizer is None and iters > 0: @@ -379,7 +378,7 @@ def __init__( import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] - def _set_device(self, device_map:Union[str, torch.device, int,dict])->None: + def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: if hasattr(self, "device") and self.device is not None: return if isinstance(device_map, (str, torch.device, int)): @@ -1937,7 +1936,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: del layer_input clear_memory(q_layer_input) - def _set_layerwise_config(self, model:torch.nn.Module, layer_config: dict) -> bool: + def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> bool: """ Sets the layer-wise configuration based on the provided `layer_config`. 
By default, only quantize layers in blocks. diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 7b6cf2f4d..af51a881e 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,7 +14,7 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import Optional, Iterable +from typing import Iterable, Optional __all__ = ["QuantizationScheme", "preset_name_to_scheme", "AutoScheme"] @@ -181,7 +181,6 @@ def is_preset_scheme(name: str) -> bool: ) - # FP8 = asdict(QuantArgs.from_dict({ # "bits": 8, # "group_size": 128, @@ -239,7 +238,7 @@ def is_preset_scheme(name: str) -> bool: @dataclass class AutoScheme: - options:Optional[Iterable[QuantizationScheme]] - target_bits:float - shared_layers:Optional[Iterable[Iterable[str]]]=None - method:str="naive_pre" \ No newline at end of file + options: Optional[Iterable[QuantizationScheme]] + target_bits: float + shared_layers: Optional[Iterable[Iterable[str]]] = None + method: str = "naive_pre" From a4ef4950ad11a523e6c2679384c00b5b4ceadaf6 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 14:19:25 +0800 Subject: [PATCH 03/35] update a little --- auto_round/__main__.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index a69359db8..adafd095e 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -466,14 +466,6 @@ def tune(args): extra_config.tuning_config = tuning_config extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config - layer_config = {} - from auto_round.auto_schemes.delta_loss import get_mixed_config_layer_config - - best_path = get_mixed_config_layer_config(model_name, target_bits=6) - for item in best_path: - layer_config[item[0]] = {} - layer_config[item[0]]["bits"] = item[1] - layer_config[item[0]]["act_bits"] = item[1] autoround: BaseCompressor = AutoRound( model=model_name, @@ -491,8 +483,7 @@ def tune(args): fp_layers=args.fp_layers, not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, - extra_config=extra_config, - layer_config=layer_config, + extra_config=extra_config ) model_name = args.model.rstrip("/") From 4173c3eb7626b8509896d49c30958029ced1864e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 06:20:01 +0000 Subject: [PATCH 04/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index adafd095e..78a8fc9d6 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -483,7 +483,7 @@ def tune(args): fp_layers=args.fp_layers, not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, - extra_config=extra_config + extra_config=extra_config, ) model_name = args.model.rstrip("/") From 87e945407d7a09cb083a387be574681eee3a1ce0 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 17:04:32 +0800 Subject: [PATCH 05/35] update a little --- auto_round/compressors/base.py | 5 ++--- auto_round/utils.py | 18 ------------------ 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 72ca17ddc..f08540207 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -75,7 +75,6 @@ get_lm_head_name, get_max_vram, get_module, - get_quant_keys, get_shared_keys, 
htcore, infer_bits_by_data_type, @@ -1710,7 +1709,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - self.has_qlayer_outside_block = self._set_layerwise_config(model, self.layer_config) + self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: @@ -1956,7 +1955,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = get_quant_keys() + keys = [f.name for f in fields(QuantizationScheme)] for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning diff --git a/auto_round/utils.py b/auto_round/utils.py index 9af09758e..131a91db3 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2308,24 +2308,6 @@ def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16): return model -def get_quant_keys(): - keys = [ - "bits", - "group_size", - "sym", - "data_type", - "scale_dtype", - "act_bits", - "act_group_size", - "act_sym", - "act_dynamic", - "act_data_type", - "super_bits", - "super_group_size", - ] - return keys - - def out_of_vram(error_msg): error_msg = str(error_msg) # CUDA From 242d1ee29eb4833f0f2aba2d322e0380f8712ea5 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 25 Sep 2025 17:28:48 +0800 Subject: [PATCH 06/35] try to refine parse layer config code --- auto_round/compressors/base.py | 179 +++++++++++++++++++++++++++++++++ auto_round/utils.py | 5 +- 2 files changed, 182 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f08540207..cbe95703b 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -401,6 +401,185 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") + + + # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different + def _convert_value_layer_config_to_dict(self, + layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> dict: + + new_layer_config = {} if layer_config is None else layer_config + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key, item in new_layer_config.items(): + if isinstance(item, str): + item = asdict(preset_name_to_scheme(item.upper())) + new_layer_config[key] = item + elif isinstance(item, QuantizationScheme): + config = asdict(item) + tmp_keys = copy.deepcopy(list(config.keys())) + for tmp_key in tmp_keys: # Pop None value to be overridden + if config[tmp_key] is None: + config.pop(tmp_key) + elif isinstance(item, dict): + item_keys = item.keys() + if item_keys not in scheme_keys: + for item_key in item_keys: + if item_key not in scheme_keys: + raise ValueError( + f"the key {item_key} in layer_config for layer {key} is invalid," + f" only {scheme_keys} are supported" + ) + new_layer_config[key]["fixed_by_user"] = True + return new_layer_config + + def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, dict], fp_layers, quant_lm_head, + scheme, quant_block_list, 
supported_types, inner_supported_types): + """ + Sets the layer-wise configuration based on the provided `layer_config`. + By default, only quantize layers in blocks. + + Args: + layer_config (dict): The configuration dictionary for each layer containing various configuration options. + + Returns: + bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), + otherwise returns False. + """ + + # set fp layers + not_quantize_layer_names = get_fp_layer_names(model, fp_layers) + # if len(not_quantize_layer_names) > 0: + # logger.info(f"{not_quantize_layer_names} will not be quantized.") + if layer_config is None: + layer_config = {} + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", + "act_data_type": "float", "fixed_by_user": True} + + # Get the names of layers in quantization blocks + layers_in_blocks = get_layer_names_in_block( + model, supported_types, quant_block_list, inner_supported_types + ) + # Process regex in layer_config + all_supported_layer_names = [] + # List of configuration keys + scheme_keys = (f.name for f in fields(QuantizationScheme)) + + for n, m in model.named_modules(): + # Delete previous configuration to avoid conflicts with prior tuning + for key in scheme_keys: + if hasattr(m, key): + delattr(m, key) + if type(m) not in supported_types and m.__class__.__name__ not in self.inner_supported_types: + continue + all_supported_layer_names.append(n) + + names_in_layer_config = list(layer_config.keys()) + for name in names_in_layer_config: + if name in all_supported_layer_names: + continue + matched_names = [] + for layer_name in all_supported_layer_names: + if re.search(re.compile(name), layer_name) is not None: + matched_names.append(layer_name) + if len(matched_names) > 0: + val = layer_config[name] + layer_config.pop(name) + for match_name in matched_names: + layer_config[match_name] = val + else: + tmp_m = get_module(model, name) + if type(tmp_m) != torch.nn.Embedding: # GGUF needs to quantize embedding layer + raise ValueError(f"key {name} in layer_config is invalid, please have a double check") + + has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) + + # Iterate through all modules in the model + is_gguf = ("gguf" in scheme.lower() or + (hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats))) + for n, m in model.named_modules(): + # Skip unsupported types + if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: + if n in layer_config: + if not isinstance(m, torch.nn.Embedding): + logger.warning(f"{n} is not supported, layer_config {n}: {layer_config[n]} will be ignored.") + layer_config.pop(n) + + if not is_gguf: # TODO the code here seems to could be deleted + if not check_to_quantized(layer_config[n]): + layer_config.pop(n) + + continue + + # If the layer is not in the config and is part of a quantization block, use default configuration + if n not in layer_config.keys() and n in layers_in_blocks: + layer_config[n] = {} + for key in scheme_keys: + layer_config[n][key] = getattr(self, key) + + # If the layer is partially configured, fill in missing values + elif n in layer_config.keys(): + if "data_type" in layer_config[n] and "bits" not in layer_config[n]: + tmp_bits = infer_bits_by_data_type(layer_config[n]["data_type"]) + if tmp_bits is not None and tmp_bits != self.bits: + logger.warning( + f"'data_type' do not match the specified 'bits' setting 
for {n}." + f" Resetting 'bits' to {tmp_bits}." + ) + layer_config[n]["bits"] = tmp_bits + if "act_data_type" in layer_config[n] and "act_bits" not in layer_config[n]: + tmp_bits = infer_bits_by_data_type(layer_config[n]["act_data_type"]) + if tmp_bits is not None and tmp_bits != self.act_bits: + logger.warning( + f"'act_data_type' do not match the specified 'act_bits' setting for {n}." + f" Resetting 'act_bits' to {tmp_bits}." + ) + layer_config[n]["act_bits"] = tmp_bits + + for key in scheme_keys: + if key not in layer_config[n].keys(): + layer_config[n][key] = getattr(self, key) + layer_config[n]["fixed_by_user"] = True + + # If the layer is not in the config and not part of a quantization block, + # use default configuration and set specific values + else: + layer_config[n] = {} + for key in scheme_keys: + layer_config[n][key] = getattr(self, key) + layer_config[n]["bits"] = 16 + layer_config[n]["act_bits"] = 16 + + if n in layers_in_blocks: + layer_config[n]["in_blocks"] = True + else: + layer_config[n]["in_blocks"] = False + + # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block + if ( + n not in layers_in_blocks + and check_to_quantized(layer_config[n]) + and not isinstance(m, torch.nn.Embedding) + ): + has_qlayer_outside_block = True + + in_features, out_features = get_layer_features(m) + if in_features <= layer_config[n]["group_size"]: + layer_config[n]["group_size"] = -1 + + # Apply the configuration to the corresponding layer in the model + for key in scheme_keys: + setattr(m, key, layer_config[n][key]) + + + # TODO self.quant_lm_head has not handleed yet + + need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() + if need_to_quantize_lm_head: + has_qlayer_outside_block = True + + # Return whether there are quantized layers outside the blocks + return has_qlayer_outside_block + def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers) -> None: """Parse and set the layer-wise quantization configuration.""" not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) diff --git a/auto_round/utils.py b/auto_round/utils.py index 131a91db3..bd3d1d2b5 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -21,6 +21,7 @@ import re import sys from collections import UserDict +from dataclasses import fields from enum import Enum from functools import lru_cache from pathlib import Path @@ -2278,8 +2279,8 @@ def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16): new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype) if layer.bias is not None: new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype)) - - keys = get_quant_keys() + ["tmp_name"] + scheme_keys = [f.name for f in fields(QuantizationScheme)] + keys = scheme_keys + ["tmp_name"] for key in keys: setattr(new_layer, key, getattr(layer, key, None)) From 4fc6b64a56d0fa811f0f210833f366acebe9c918 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 09:29:46 +0000 Subject: [PATCH 07/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 61 ++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index cbe95703b..0cb65f332 100644 --- a/auto_round/compressors/base.py +++ 
b/auto_round/compressors/base.py @@ -401,11 +401,10 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - - # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different - def _convert_value_layer_config_to_dict(self, - layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> dict: + def _convert_value_layer_config_to_dict( + self, layer_config: dict[str, Union[str, dict, QuantizationScheme]] + ) -> dict: new_layer_config = {} if layer_config is None else layer_config scheme_keys = [f.name for f in fields(QuantizationScheme)] @@ -431,19 +430,28 @@ def _convert_value_layer_config_to_dict(self, new_layer_config[key]["fixed_by_user"] = True return new_layer_config - def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, dict], fp_layers, quant_lm_head, - scheme, quant_block_list, supported_types, inner_supported_types): + def _expand_layer_config( + self, + model: torch.nn.Module, + layer_config: dict[str, dict], + fp_layers, + quant_lm_head, + scheme, + quant_block_list, + supported_types, + inner_supported_types, + ): """ - Sets the layer-wise configuration based on the provided `layer_config`. - By default, only quantize layers in blocks. + Sets the layer-wise configuration based on the provided `layer_config`. + By default, only quantize layers in blocks. - Args: - layer_config (dict): The configuration dictionary for each layer containing various configuration options. + Args: + layer_config (dict): The configuration dictionary for each layer containing various configuration options. - Returns: - bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), - otherwise returns False. - """ + Returns: + bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), + otherwise returns False. 
+ """ # set fp layers not_quantize_layer_names = get_fp_layer_names(model, fp_layers) @@ -452,13 +460,16 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d if layer_config is None: layer_config = {} for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", - "act_data_type": "float", "fixed_by_user": True} + layer_config[name] = { + "bits": 16, + "act_bits": 16, + "data_type": "float", + "act_data_type": "float", + "fixed_by_user": True, + } # Get the names of layers in quantization blocks - layers_in_blocks = get_layer_names_in_block( - model, supported_types, quant_block_list, inner_supported_types - ) + layers_in_blocks = get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types) # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys @@ -494,8 +505,9 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) # Iterate through all modules in the model - is_gguf = ("gguf" in scheme.lower() or - (hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats))) + is_gguf = "gguf" in scheme.lower() or ( + hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) + ) for n, m in model.named_modules(): # Skip unsupported types if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: @@ -556,9 +568,9 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block if ( - n not in layers_in_blocks - and check_to_quantized(layer_config[n]) - and not isinstance(m, torch.nn.Embedding) + n not in layers_in_blocks + and check_to_quantized(layer_config[n]) + and not isinstance(m, torch.nn.Embedding) ): has_qlayer_outside_block = True @@ -570,7 +582,6 @@ def _expand_layer_config(self, model: torch.nn.Module, layer_config: dict[str, d for key in scheme_keys: setattr(m, key, layer_config[n][key]) - # TODO self.quant_lm_head has not handleed yet need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() From 7f76db26d3f3c8fe51b777928eb6ca078b22c138 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 05:08:34 +0000 Subject: [PATCH 08/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index e67adfc15..ecf87a62d 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -296,7 +296,7 @@ def __init__( for name in not_quantize_layer_names: layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} self._parse_layer_config(layer_config) # must place after model init - + self.to_quant_block_names = to_quant_block_names # Set device, must place after model loading From ae8837b0b2bb1e8ef8ae03085d4b6b728977e495 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 13:29:59 +0800 Subject: [PATCH 09/35] fix --- auto_round/compressors/base.py | 2 +- auto_round/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 53667c090..aaac722ba 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2145,7 +2145,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = [f.name for f in fields(QuantizationScheme)] + keys = (f.name for f in fields(QuantizationScheme)) + ("scale_dtype") for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning diff --git a/auto_round/utils.py b/auto_round/utils.py index dedaf5c2c..a48751d3e 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2279,8 +2279,8 @@ def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16): new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype) if layer.bias is not None: new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype)) - scheme_keys = [f.name for f in fields(QuantizationScheme)] - keys = scheme_keys + ["tmp_name"] + scheme_keys = (f.name for f in fields(QuantizationScheme)) + keys = scheme_keys + ("tmp_name", "scale_dtype") for key in keys: setattr(new_layer, key, getattr(layer, key, None)) From 531224de42da3f9ed466bc30c15121ec5597e80f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 13:34:11 +0800 Subject: [PATCH 10/35] fix --- auto_round/compressors/base.py | 13 +++---------- auto_round/utils.py | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 71df8b89f..8c0b699a2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -230,7 +230,7 @@ def __init__( self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False # Scale factor for RAM usage per parameter. self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - fp_layers = kwargs.pop("fp_layers", None) + fp_layers = kwargs.pop("fp_layers", "") if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") @@ -288,14 +288,7 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - if len(not_quantize_layer_names) > 0: - logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - self._parse_layer_config(layer_config) # must place after model init + self._parse_layer_config(layer_config, fp_layers) # Must place after model init self.to_quant_block_names = to_quant_block_names @@ -611,7 +604,7 @@ def _expand_layer_config( # Return whether there are quantized layers outside the blocks return has_qlayer_outside_block - def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers) -> None: + def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers:str) -> None: """Parse and set the layer-wise quantization configuration.""" not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) if len(not_quantize_layer_names) > 0: diff --git a/auto_round/utils.py b/auto_round/utils.py index a48751d3e..1cb36f2fb 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1046,7 +1046,7 @@ def can_pack_with_numba(): # pragma: no cover return True -def get_fp_layer_names(model, fp_layers): +def get_fp_layer_names(model:torch.nn.Module, fp_layers:str): """Identifies and returns layers in the model to exclude from quantization. This function processes a comma-separated list of fully precision (FP) layers, From c9fa4088aee6d24b05993ea2d3766105ab6793db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 05:34:44 +0000 Subject: [PATCH 11/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 4 +++- auto_round/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8c0b699a2..bc83e041a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -604,7 +604,9 @@ def _expand_layer_config( # Return whether there are quantized layers outside the blocks return has_qlayer_outside_block - def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers:str) -> None: + def _parse_layer_config( + self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str + ) -> None: """Parse and set the layer-wise quantization configuration.""" not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) if len(not_quantize_layer_names) > 0: diff --git a/auto_round/utils.py b/auto_round/utils.py index 1cb36f2fb..8525d7a88 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1046,7 +1046,7 @@ def can_pack_with_numba(): # pragma: no cover return True -def get_fp_layer_names(model:torch.nn.Module, fp_layers:str): +def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): """Identifies and returns layers in the model to exclude from quantization. 
This function processes a comma-separated list of fully precision (FP) layers, From 6453200001920d6c9a0402680aaf4507bc45924a Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 13:41:32 +0800 Subject: [PATCH 12/35] fix --- auto_round/compressors/base.py | 2 +- auto_round/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8c0b699a2..2512ef5e8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2158,7 +2158,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = (f.name for f in fields(QuantizationScheme)) + ("scale_dtype") + keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype") for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning diff --git a/auto_round/utils.py b/auto_round/utils.py index 1cb36f2fb..9bbdbc161 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2280,7 +2280,7 @@ def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16): if layer.bias is not None: new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype)) scheme_keys = (f.name for f in fields(QuantizationScheme)) - keys = scheme_keys + ("tmp_name", "scale_dtype") + keys = tuple(scheme_keys) + ("tmp_name", "scale_dtype") for key in keys: setattr(new_layer, key, getattr(layer, key, None)) From 3811010768472c9da67b13213dc1d571d457a8cd Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 14:07:19 +0800 Subject: [PATCH 13/35] tmp_change --- auto_round/compressors/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 48814853b..c7c3abe40 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -416,7 +416,7 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different def _convert_value_layer_config_to_dict( - self, layer_config: dict[str, Union[str, dict, QuantizationScheme]] + self, layer_config: dict[str, Union[str, dict, QuantizationScheme]],default_scheme:QuantizationScheme, ) -> dict: new_layer_config = {} if layer_config is None else layer_config @@ -441,6 +441,7 @@ def _convert_value_layer_config_to_dict( f" only {scheme_keys} are supported" ) new_layer_config[key]["fixed_by_user"] = True + return new_layer_config def _expand_layer_config( From 4de7b0879cba422eac13532ed716df816b06c6ba Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 14:43:26 +0800 Subject: [PATCH 14/35] commit --- auto_round/compressors/base.py | 53 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c7c3abe40..067cddeda 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -416,31 +416,48 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different def _convert_value_layer_config_to_dict( - self, layer_config: dict[str, Union[str, dict, QuantizationScheme]],default_scheme:QuantizationScheme, + self, + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + 
default_scheme: "QuantizationScheme", + use_auto_mixed_bit_in_gguf: bool = False, ) -> dict: + """ + Convert layer_config values (string, dict, QuantizationScheme) into a standardized dict format. + Adds 'fixed_by_user': True for each processed layer config. + """ + if layer_config is None: + return {} + + scheme_keys = {f.name for f in fields(QuantizationScheme)} + new_layer_config = copy.deepcopy(layer_config) - new_layer_config = {} if layer_config is None else layer_config - scheme_keys = [f.name for f in fields(QuantizationScheme)] for key, item in new_layer_config.items(): if isinstance(item, str): - item = asdict(preset_name_to_scheme(item.upper())) - new_layer_config[key] = item + # Convert preset name to scheme dict + config = asdict(preset_name_to_scheme(item.upper())) elif isinstance(item, QuantizationScheme): config = asdict(item) - tmp_keys = copy.deepcopy(list(config.keys())) - for tmp_key in tmp_keys: # Pop None value to be overridden - if config[tmp_key] is None: - config.pop(tmp_key) elif isinstance(item, dict): - item_keys = item.keys() - if item_keys not in scheme_keys: - for item_key in item_keys: - if item_key not in scheme_keys: - raise ValueError( - f"the key {item_key} in layer_config for layer {key} is invalid," - f" only {scheme_keys} are supported" - ) - new_layer_config[key]["fixed_by_user"] = True + # Validate dict keys + invalid_keys = set(item) - scheme_keys + if invalid_keys: + raise ValueError( + f"Invalid keys {invalid_keys} in layer_config for layer '{key}', " + f"only {scheme_keys} are supported." + ) + config = dict(item) + else: + raise TypeError( + f"Unsupported type for layer_config[{key}]: {type(item)}. " + f"Expected str, dict, or QuantizationScheme." + ) + + # Drop None values + config = {k: v for k, v in config.items() if v is not None} + + # Mark as user-fixed + config["fixed_by_user"] = True + new_layer_config[key] = config return new_layer_config From a9f0e444fff29ca12aafd6dd24bc7e8a933534c1 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 14:44:44 +0800 Subject: [PATCH 15/35] commit --- auto_round/compressors/base.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 067cddeda..13ef303a0 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -446,11 +446,6 @@ def _convert_value_layer_config_to_dict( f"only {scheme_keys} are supported." ) config = dict(item) - else: - raise TypeError( - f"Unsupported type for layer_config[{key}]: {type(item)}. " - f"Expected str, dict, or QuantizationScheme." 
- ) # Drop None values config = {k: v for k, v in config.items() if v is not None} From 59a9f5df246da7d9676d6315435b6626da07e582 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 16:39:17 +0800 Subject: [PATCH 16/35] update a little --- auto_round/compressors/base.py | 116 +++++++++++++++++++++++++++------ auto_round/schemes.py | 14 +++- 2 files changed, 109 insertions(+), 21 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 13ef303a0..c12aea15f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -414,47 +414,125 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - # TODO gguf apply mixd bits, so the gguf scheme meanings in scheme and autoscheme are different - def _convert_value_layer_config_to_dict( + def _prepare_layer_config( self, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + model: torch.nn.Module, + orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], default_scheme: "QuantizationScheme", - use_auto_mixed_bit_in_gguf: bool = False, + supported_types, + inner_supported_types, + fp_layers: str = "", + quant_lm_head: bool = False, ) -> dict: """ - Convert layer_config values (string, dict, QuantizationScheme) into a standardized dict format. - Adds 'fixed_by_user': True for each processed layer config. + Normalize and validate layer-specific quantization schemes, + expand regex-based configs, and merge with default scheme. """ - if layer_config is None: - return {} + from auto_round.schemes import is_gguf_scheme scheme_keys = {f.name for f in fields(QuantizationScheme)} - new_layer_config = copy.deepcopy(layer_config) + layer_config = copy.deepcopy(orig_layer_config) or {} + + # Mark layers that should stay in FP + not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) + for name in not_quantize_layer_names: + layer_config[name] = { + "bits": 16, + "act_bits": 16, + "data_type": "float", + "act_data_type": "float", + } - for key, item in new_layer_config.items(): + def normalize_item(item, layer_name: str) -> dict: + """Convert a single config entry to dict and validate keys.""" if isinstance(item, str): - # Convert preset name to scheme dict config = asdict(preset_name_to_scheme(item.upper())) elif isinstance(item, QuantizationScheme): config = asdict(item) elif isinstance(item, dict): - # Validate dict keys invalid_keys = set(item) - scheme_keys if invalid_keys: raise ValueError( - f"Invalid keys {invalid_keys} in layer_config for layer '{key}', " + f"Invalid keys {invalid_keys} in layer_config for layer '{layer_name}', " f"only {scheme_keys} are supported." ) config = dict(item) - - # Drop None values + else: + raise TypeError( + f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " + f"Expected str, dict, or QuantizationScheme." 
+ ) + # Drop None values & mark as fixed config = {k: v for k, v in config.items() if v is not None} - - # Mark as user-fixed config["fixed_by_user"] = True - new_layer_config[key] = config + return config + + # Normalize configs + layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} + + # Infer missing bits from data_type / act_data_type + for cfg in layer_config.values(): + if "data_type" in cfg and "bits" not in cfg: + if (tmp_bits := infer_bits_by_data_type(cfg["data_type"])) is not None: + cfg["bits"] = tmp_bits + if "act_data_type" in cfg and "act_bits" not in cfg: + if (tmp_bits := infer_bits_by_data_type(cfg["act_data_type"])) is not None: + cfg["act_bits"] = tmp_bits + + # Fill missing values from default scheme + default_dict = asdict(default_scheme) + for cfg in layer_config.values(): + for scheme_key in scheme_keys: + cfg.setdefault(scheme_key, default_dict.get(scheme_key)) + + # Special case for GGUF + is_gguf = is_gguf_scheme(default_scheme) + if is_gguf and torch.nn.Embedding not in supported_types: + supported_types = tuple(list(supported_types) + [torch.nn.Embedding]) + + # Collect all supported layer names + all_supported_layer_names = [] + for n, m in model.named_modules(): + # Clear old attributes to avoid conflicts + for key in scheme_keys: + if hasattr(m, key): + delattr(m, key) + if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: + continue + all_supported_layer_names.append(n) + + # Expand regex configs (compile once, reuse) + for name in list(layer_config.keys()): + if name in all_supported_layer_names: + continue + regex = re.compile(name) + matched_names = [ln for ln in all_supported_layer_names if regex.search(ln)] + if matched_names: + val = layer_config.pop(name) + for match_name in matched_names: + layer_config[match_name] = val + else: + raise ValueError(f"Key '{name}' in layer_config is invalid, please double check.") + + # Enforce group_size = 32 constraint for INT weight-only quantization + if default_scheme.data_type == "int" and default_scheme.act_bits >= 16 and not is_gguf: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + if n in layer_config: + layer_config[n]["bits"] = 16 + layer_config[n]["data_type"] = "fp" + logger.warning_once( + f"{n} will not be quantized because its shape is not divisible by 32. " + "It will be exported in FP16 instead." 
+ ) + + # Handle lm_head + lm_head_name = get_lm_head_name(model) + if lm_head_name not in layer_config and (quant_lm_head or is_gguf): + layer_config[lm_head_name] = default_dict.copy() - return new_layer_config + return layer_config def _expand_layer_config( self, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index af51a881e..9c12b61c0 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,9 +14,9 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import Iterable, Optional +from typing import Iterable, Optional, Union -__all__ = ["QuantizationScheme", "preset_name_to_scheme", "AutoScheme"] +__all__ = ["QuantizationScheme", "is_gguf_scheme", "preset_name_to_scheme", "AutoScheme"] @dataclass @@ -235,6 +235,16 @@ def is_preset_scheme(name: str) -> bool: value.pop("lm_head", None) PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) +def is_gguf_scheme(scheme:Union[str, QuantizationScheme])->bool: + if isinstance(scheme,str) and scheme.upper().startswith("GGUF"): + return True + for key, val in PRESET_SCHEMES.items(): + if not key.upper().startswith("GGUF"): + continue + if val==scheme: + return True + return False + @dataclass class AutoScheme: From 1b7e911656995558c6ea900a09f97f705cadd089 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:39:58 +0000 Subject: [PATCH 17/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 16 ++++++++-------- auto_round/schemes.py | 7 ++++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c12aea15f..e8bb2fad2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -415,14 +415,14 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _prepare_layer_config( - self, - model: torch.nn.Module, - orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - supported_types, - inner_supported_types, - fp_layers: str = "", - quant_lm_head: bool = False, + self, + model: torch.nn.Module, + orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + supported_types, + inner_supported_types, + fp_layers: str = "", + quant_lm_head: bool = False, ) -> dict: """ Normalize and validate layer-specific quantization schemes, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 9c12b61c0..97a3cdf02 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -235,13 +235,14 @@ def is_preset_scheme(name: str) -> bool: value.pop("lm_head", None) PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) -def is_gguf_scheme(scheme:Union[str, QuantizationScheme])->bool: - if isinstance(scheme,str) and scheme.upper().startswith("GGUF"): + +def is_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: + if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return True for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue - if val==scheme: + if val == scheme: return True return False From e0680493b54b9e64e0eb8d26219435c2c3f58170 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 16:44:08 +0800 
Subject: [PATCH 18/35] fix --- auto_round/compressors/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c12aea15f..7538fc362 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -519,12 +519,13 @@ def normalize_item(item, layer_name: str) -> dict: for n, m in model.named_modules(): if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - if n in layer_config: - layer_config[n]["bits"] = 16 - layer_config[n]["data_type"] = "fp" + if n not in layer_config: + layer_config[n] = default_dict.copy() + layer_config[n]["bits"] = 16 + layer_config[n]["data_type"] = "fp" + layer_config[n]["fixed_by_user"] = True logger.warning_once( f"{n} will not be quantized because its shape is not divisible by 32. " - "It will be exported in FP16 instead." ) # Handle lm_head From 0357c0b94b7070da52c9492212be961fab69994e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:45:30 +0000 Subject: [PATCH 19/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a1558d247..182cc435f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -524,9 +524,7 @@ def normalize_item(item, layer_name: str) -> dict: layer_config[n]["bits"] = 16 layer_config[n]["data_type"] = "fp" layer_config[n]["fixed_by_user"] = True - logger.warning_once( - f"{n} will not be quantized because its shape is not divisible by 32. " - ) + logger.warning_once(f"{n} will not be quantized because its shape is not divisible by 32. 
") # Handle lm_head lm_head_name = get_lm_head_name(model) From 602421c6ab2340476ece3a6959409b7d58be8320 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 26 Sep 2025 18:22:47 +0800 Subject: [PATCH 20/35] merge autoscheme to scheme --- auto_round/compressors/base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8dbb0e3cc..0e21e00d5 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -128,8 +128,7 @@ def __init__( self, model: Union[torch.nn.Module, str], tokenizer=None, - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - auto_scheme: AutoScheme = None, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -2247,7 +2246,7 @@ def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> b # Process regex in layer_config all_supported_layer_names = [] # List of configuration keys - keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype") + keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) for n, m in model.named_modules(): # Delete previous configuration to avoid conflicts with prior tuning From 091c5ad0e9045dbcc693aa3163ca4ec476ddf44f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 14:31:43 +0800 Subject: [PATCH 21/35] refine layer_config code --- auto_round/__main__.py | 10 + auto_round/autoround.py | 4 +- auto_round/compressors/base.py | 635 +++++++++------------------------ auto_round/schemes.py | 8 +- auto_round/utils.py | 18 +- 5 files changed, 191 insertions(+), 484 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 07bc3f273..43f55a050 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -470,6 +470,15 @@ def tune(args): extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config + layer_config = {} + # from auto_round.auto_schemes.haha import get_mixed_config_layer_config + # layer_config = {} + # best_path = get_mixed_config_layer_config(model_name, target_bits=3) + # for item in best_path: + # layer_config[item[0]] = {} + # layer_config[item[0]]["bits"] = item[1] + + autoround: BaseCompressor = AutoRound( model=model_name, scheme=scheme, @@ -486,6 +495,7 @@ def tune(args): not_use_best_mse=args.not_use_best_mse, enable_adam=args.adam, extra_config=extra_config, + layer_config=layer_config, ) model_name = args.model.rstrip("/") diff --git a/auto_round/autoround.py b/auto_round/autoround.py index ae1a37677..ccdca1f09 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -63,8 +63,7 @@ def __new__( cls, model: Union[torch.nn.Module, str], tokenizer=None, - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - auto_scheme: AutoScheme = None, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, @@ -159,7 +158,6 @@ def __new__( model=model, tokenizer=tokenizer, scheme=scheme, - auto_scheme=auto_scheme, layer_config=layer_config, dataset=dataset, iters=iters, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 0e21e00d5..590c480e8 100644 --- a/auto_round/compressors/base.py +++ 
b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme +from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme, get_gguf_scheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -201,8 +201,7 @@ def __init__( ... # ... ... } """ - self.scheme = None - self._parse_and_set_scheme(scheme, kwargs) + self.scheme = self._parse_and_set_scheme(scheme, kwargs) # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options amp = kwargs.pop("amp", True) @@ -229,7 +228,8 @@ def __init__( self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False # Scale factor for RAM usage per parameter. self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - fp_layers = kwargs.pop("fp_layers", "") + self.fp_layers = kwargs.pop("fp_layers", "") + self.layer_config = layer_config if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") @@ -287,7 +287,7 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - self._parse_layer_config(layer_config, fp_layers) # Must place after model init + # self._parse_layer_config(layer_config, fp_layers) # Must place after model init self.to_quant_block_names = to_quant_block_names @@ -414,46 +414,46 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _prepare_layer_config( - self, - model: torch.nn.Module, - orig_layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - supported_types, - inner_supported_types, - fp_layers: str = "", - quant_lm_head: bool = False, - ) -> dict: + self, + model: torch.nn.Module, + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + default_scale_dtype: torch.dtype | str, + supported_types: tuple, + inner_supported_types: tuple, + quant_block_list=None, + fp_layers: str = "", + quant_lm_head: bool = False, + enable_gguf_official_mixed: bool = True, + is_mllm: bool = False, + ) -> tuple[dict, bool]: """ - Normalize and validate layer-specific quantization schemes, - expand regex-based configs, and merge with default scheme. + Normalize, validate, and expand layer-specific quantization configs. 
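A minimal sketch of the layer_config inputs this helper is written to accept, following the normalize_item branches below; the layer names, preset name, and values are illustrative assumptions rather than part of the patch:

    from auto_round.schemes import QuantizationScheme

    layer_config = {
        "model.decoder.layers.0.self_attn.q_proj": "W2A16",            # preset name, resolved via preset_name_to_scheme
        "model.decoder.layers.1.fc1": {"bits": 8, "group_size": 128},  # partial dict; unset keys come from the default scheme
        "lm_head": QuantizationScheme.from_dict({"bits": 4}),          # scheme object, converted with asdict
    }
    # After normalization every entry is a plain dict with fixed_by_user=True,
    # bits inferred from data_type when missing, and regex keys expanded to
    # the matching layer names.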
+ Returns (final_layer_config, has_quant_layer_outside_block) """ - from auto_round.schemes import is_gguf_scheme - scheme_keys = {f.name for f in fields(QuantizationScheme)} - layer_config = copy.deepcopy(orig_layer_config) or {} + from auto_round.schemes import get_gguf_scheme - # Mark layers that should stay in FP - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - for name in not_quantize_layer_names: - layer_config[name] = { - "bits": 16, - "act_bits": 16, - "data_type": "float", - "act_data_type": "float", - } + # ---- helpers ------------------------------------------------- + def dispatch_layer_config(layer_config: dict[str, dict]) -> None: + """Assign scheme values as attributes to matched modules.""" + for layer_name, scheme in layer_config.items(): + module = get_module(model, layer_name) + for attr, value in scheme.items(): + setattr(module, attr, value) - def normalize_item(item, layer_name: str) -> dict: - """Convert a single config entry to dict and validate keys.""" + def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: + """Convert config entry into dict and validate keys.""" if isinstance(item, str): config = asdict(preset_name_to_scheme(item.upper())) elif isinstance(item, QuantizationScheme): config = asdict(item) elif isinstance(item, dict): - invalid_keys = set(item) - scheme_keys - if invalid_keys: + invalid = set(item) - set(scheme_keys) + if invalid: raise ValueError( - f"Invalid keys {invalid_keys} in layer_config for layer '{layer_name}', " - f"only {scheme_keys} are supported." + f"Invalid keys {invalid} in layer_config for '{layer_name}'. " + f"Allowed keys: {scheme_keys}" ) config = dict(item) else: @@ -461,237 +461,135 @@ def normalize_item(item, layer_name: str) -> dict: f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " f"Expected str, dict, or QuantizationScheme." ) - # Drop None values & mark as fixed + # Clean up config = {k: v for k, v in config.items() if v is not None} config["fixed_by_user"] = True return config - # Normalize configs + # ---- main logic ---------------------------------------------- + scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) + layer_config = copy.deepcopy(layer_config) or {} + + # 1. fp_layers -> force 16 + for name in get_fp_layer_names(self.model, fp_layers): + layer_config[name] = { + "bits": 16, "act_bits": 16, + "data_type": "float", "act_data_type": "float" + } + + # 2. normalize layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} - # Infer missing bits from data_type / act_data_type + # 3. infer missing bits for cfg in layer_config.values(): if "data_type" in cfg and "bits" not in cfg: - if (tmp_bits := infer_bits_by_data_type(cfg["data_type"])) is not None: - cfg["bits"] = tmp_bits + if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: + cfg["bits"] = b if "act_data_type" in cfg and "act_bits" not in cfg: - if (tmp_bits := infer_bits_by_data_type(cfg["act_data_type"])) is not None: - cfg["act_bits"] = tmp_bits + if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: + cfg["act_bits"] = b - # Fill missing values from default scheme + # 4. 
fill defaults default_dict = asdict(default_scheme) + default_dict["scale_dtype"] = default_scale_dtype for cfg in layer_config.values(): - for scheme_key in scheme_keys: - cfg.setdefault(scheme_key, default_dict.get(scheme_key)) + for key in scheme_keys: + cfg.setdefault(key, default_dict.get(key)) - # Special case for GGUF - is_gguf = is_gguf_scheme(default_scheme) - if is_gguf and torch.nn.Embedding not in supported_types: - supported_types = tuple(list(supported_types) + [torch.nn.Embedding]) + # 5. collect supported modules + gguf_name = get_gguf_scheme(default_scheme) + if gguf_name and torch.nn.Embedding not in supported_types: + supported_types = (*supported_types, torch.nn.Embedding) - # Collect all supported layer names - all_supported_layer_names = [] + all_layer_names, embedding_layer_names = [], [] for n, m in model.named_modules(): - # Clear old attributes to avoid conflicts + # cleanup stale attributes for key in scheme_keys: if hasattr(m, key): delattr(m, key) if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: continue - all_supported_layer_names.append(n) + all_layer_names.append(n) + if isinstance(m, torch.nn.Embedding): + embedding_layer_names.append(n) - # Expand regex configs (compile once, reuse) + # 6. expand regex configs for name in list(layer_config.keys()): - if name in all_supported_layer_names: + if name in all_layer_names: continue regex = re.compile(name) - matched_names = [ln for ln in all_supported_layer_names if regex.search(ln)] - if matched_names: - val = layer_config.pop(name) - for match_name in matched_names: - layer_config[match_name] = val - else: - raise ValueError(f"Key '{name}' in layer_config is invalid, please double check.") - - # Enforce group_size = 32 constraint for INT weight-only quantization - if default_scheme.data_type == "int" and default_scheme.act_bits >= 16 and not is_gguf: - for n, m in model.named_modules(): - if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: - if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - if n not in layer_config: - layer_config[n] = default_dict.copy() - layer_config[n]["bits"] = 16 - layer_config[n]["data_type"] = "fp" - layer_config[n]["fixed_by_user"] = True - logger.warning_once(f"{n} will not be quantized because its shape is not divisible by 32. ") - - # Handle lm_head + matched = [ln for ln in all_layer_names if regex.search(ln)] + if not matched: + raise ValueError(f"Invalid regex '{name}' in layer_config, no match found.") + val = layer_config.pop(name) + for match in matched: + layer_config[match] = val + + # 7. 
lm_head lm_head_name = get_lm_head_name(model) - if lm_head_name not in layer_config and (quant_lm_head or is_gguf): + tied_lm_head = False + if ( + hasattr(model, "config") + and model.config.tie_word_embeddings + and hasattr(model, "_tied_weights_keys") + ): + tied_keys =model._tied_weights_keys + if lm_head_name in tied_keys: + tied_lm_head=True + if quant_lm_head and tied_lm_head: + quant_lm_head=False + logger.warning("reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently") + + if lm_head_name not in layer_config and quant_lm_head: layer_config[lm_head_name] = default_dict.copy() - return layer_config - - def _expand_layer_config( - self, - model: torch.nn.Module, - layer_config: dict[str, dict], - fp_layers, - quant_lm_head, - scheme, - quant_block_list, - supported_types, - inner_supported_types, - ): - """ - Sets the layer-wise configuration based on the provided `layer_config`. - By default, only quantize layers in blocks. - - Args: - layer_config (dict): The configuration dictionary for each layer containing various configuration options. - - Returns: - bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), - otherwise returns False. - """ - - # set fp layers - not_quantize_layer_names = get_fp_layer_names(model, fp_layers) - # if len(not_quantize_layer_names) > 0: - # logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = { - "bits": 16, - "act_bits": 16, - "data_type": "float", - "act_data_type": "float", - "fixed_by_user": True, - } - - # Get the names of layers in quantization blocks - layers_in_blocks = get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types) - # Process regex in layer_config - all_supported_layer_names = [] - # List of configuration keys - scheme_keys = (f.name for f in fields(QuantizationScheme)) - - for n, m in model.named_modules(): - # Delete previous configuration to avoid conflicts with prior tuning - for key in scheme_keys: - if hasattr(m, key): - delattr(m, key) - if type(m) not in supported_types and m.__class__.__name__ not in self.inner_supported_types: - continue - all_supported_layer_names.append(n) - - names_in_layer_config = list(layer_config.keys()) - for name in names_in_layer_config: - if name in all_supported_layer_names: - continue - matched_names = [] - for layer_name in all_supported_layer_names: - if re.search(re.compile(name), layer_name) is not None: - matched_names.append(layer_name) - if len(matched_names) > 0: - val = layer_config[name] - layer_config.pop(name) - for match_name in matched_names: - layer_config[match_name] = val - else: - tmp_m = get_module(model, name) - if type(tmp_m) != torch.nn.Embedding: # GGUF needs to quantize embedding layer - raise ValueError(f"key {name} in layer_config is invalid, please have a double check") - - has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) - - # Iterate through all modules in the model - is_gguf = "gguf" in scheme.lower() or ( - hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) - ) - for n, m in model.named_modules(): - # Skip unsupported types - if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: - if n in layer_config: - if not isinstance(m, torch.nn.Embedding): - logger.warning(f"{n} is not supported, layer_config 
{n}: {layer_config[n]} will be ignored.") - layer_config.pop(n) - - if not is_gguf: # TODO the code here seems to could be deleted - if not check_to_quantized(layer_config[n]): - layer_config.pop(n) - - continue - - # If the layer is not in the config and is part of a quantization block, use default configuration - if n not in layer_config.keys() and n in layers_in_blocks: - layer_config[n] = {} - for key in scheme_keys: - layer_config[n][key] = getattr(self, key) - - # If the layer is partially configured, fill in missing values - elif n in layer_config.keys(): - if "data_type" in layer_config[n] and "bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["data_type"]) - if tmp_bits is not None and tmp_bits != self.bits: - logger.warning( - f"'data_type' do not match the specified 'bits' setting for {n}." - f" Resetting 'bits' to {tmp_bits}." - ) - layer_config[n]["bits"] = tmp_bits - if "act_data_type" in layer_config[n] and "act_bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["act_data_type"]) - if tmp_bits is not None and tmp_bits != self.act_bits: - logger.warning( - f"'act_data_type' do not match the specified 'act_bits' setting for {n}." - f" Resetting 'act_bits' to {tmp_bits}." - ) - layer_config[n]["act_bits"] = tmp_bits - - for key in scheme_keys: - if key not in layer_config[n].keys(): - layer_config[n][key] = getattr(self, key) - layer_config[n]["fixed_by_user"] = True - - # If the layer is not in the config and not part of a quantization block, - # use default configuration and set specific values - else: - layer_config[n] = {} - for key in scheme_keys: - layer_config[n][key] = getattr(self, key) - layer_config[n]["bits"] = 16 - layer_config[n]["act_bits"] = 16 - - if n in layers_in_blocks: - layer_config[n]["in_blocks"] = True - else: - layer_config[n]["in_blocks"] = False - - # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block - if ( - n not in layers_in_blocks - and check_to_quantized(layer_config[n]) - and not isinstance(m, torch.nn.Embedding) - ): + # 8. enforce shape divisibility for int weight-only + if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: + layer_config.setdefault(n, default_dict.copy()) + layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) + logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") + + # 9. block layers: mark as in_blocks=True + for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): + cfg = layer_config.setdefault(name, default_dict.copy()) + cfg["in_blocks"] = True + + # ---- restore: ensure missing in_blocks are set to False and compute flag ---- + has_qlayer_outside_block = False + for cfg in layer_config.values(): + if "in_blocks" not in cfg: + cfg["in_blocks"] = False + # 如果 layer 不在 blocks 且需要量化,则标记存在 blocks 外的量化层 + if not cfg["in_blocks"] and check_to_quantized(cfg): has_qlayer_outside_block = True - in_features, out_features = get_layer_features(m) - if in_features <= layer_config[n]["group_size"]: - layer_config[n]["group_size"] = -1 + # 10. 
GGUF handling + if not gguf_name: + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block - # Apply the configuration to the corresponding layer in the model - for key in scheme_keys: - setattr(m, key, layer_config[n][key]) + # embed + lm_head defaults for gguf + if lm_head_name not in layer_config and not tied_lm_head: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[lm_head_name] = cfg + has_qlayer_outside_block = True + for emd_name in embedding_layer_names: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[emd_name] = cfg - # TODO self.quant_lm_head has not handleed yet + if enable_gguf_official_mixed: + model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT + layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) + + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block - need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() - if need_to_quantize_lm_head: - has_qlayer_outside_block = True - # Return whether there are quantized layers outside the blocks - return has_qlayer_outside_block def _parse_layer_config( self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str @@ -753,7 +651,7 @@ def _parse_layer_config( if key not in lm_head_layer_config: lm_head_layer_config[key] = getattr(self, key) - def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> None: + def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) @@ -761,7 +659,6 @@ def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kw scheme = scheme elif isinstance(scheme, str): scheme = scheme.upper() - self.scheme = scheme scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] for key in scheme_keys: @@ -807,6 +704,9 @@ def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kw if supported_dtype + str(tmp_act_bits) == self.act_data_type: # could not replace FP8_e4m3 self.act_data_type = supported_dtype break + for key in scheme_keys: + scheme[key] = getattr(self, key) + return QuantizationScheme.from_dict(scheme) def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" @@ -1112,19 +1012,29 @@ def remove_duplicates(lst): formats = format.replace("q*_", f"q{self.bits}_").replace(" ", "").split(",") formats = remove_duplicates(formats) # need the keep origin order - if isinstance(self.scheme, str) and self.scheme.lower().startswith("gguf"): + gguf_format_name = get_gguf_scheme(self.scheme) + + if gguf_format_name: for i in range(len(formats)): - if formats[i] != "fake" and formats[i] != self.scheme.lower(): + if formats[i] != "fake" and formats[i] != gguf_format_name.lower(): logger.warning( - f"reset format {formats[i]} to {self.scheme.lower()} " - f"since scheme {self.scheme} can only be exported to format {self.scheme.lower()}" + f"reset format {formats[i]} to {gguf_format_name.lower()} " + f"since scheme {gguf_format_name} can only be exported to format {gguf_format_name.lower()}" ) - 
formats[i] = self.scheme.lower() + formats[i] = gguf_format_name.lower() + _gguf_args_check(self, formats, model_type=ModelType.TEXT) if self.mllm: _gguf_args_check(self, formats, model_type=ModelType.MMPROJ) + for f in formats: + if f.startswith("gguf"): + self.scheme = preset_name_to_scheme(f) + break + + + for format_ in formats: if format_ not in SUPPORTED_FORMATS: logger.error(f"Unsupported format {format_}, please choose from {SUPPORTED_FORMATS}") @@ -1608,91 +1518,6 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _check_need_to_quantize_lm_head_embedding(self) -> bool: - """Checks if LM head and embedding layers need quantization for GGUF format. - - This function inspects the current model's formats and determines whether - it needs to apply quantization settings to the embedding and LM head layers. - The function modifies `self.layer_config` in-place and updates the model modules. - - Returns: - bool: True if the LM head needs quantization, otherwise False. - - Raises: - NotImplementedError: If multiple non-fake GGUF formats are specified. - """ - gguf_scheme = False - if isinstance(self.scheme, str) and "gguf" in self.scheme.lower(): - gguf_scheme = True - - if not hasattr(self, "formats") and not gguf_scheme: - return False - - has_gguf: bool = gguf_scheme or any("gguf" in fmt for fmt in self.formats) - if not has_gguf: - return False - if hasattr(self, "formats"): - formats: list[str] = [fmt for fmt in self.formats if "fake" not in fmt] - if not (len(formats) == 1 and "gguf" in formats[0]): - raise NotImplementedError("Only one GGUF format can be set at a time.") - target_format: str = formats[0] - - else: - target_format = self.scheme.lower() - - tie_word_embeddings: bool = getattr(getattr(self.model, "config", None), "tie_word_embeddings", True) - for name, module in self.model.named_modules(): - if isinstance(module, torch.nn.Embedding): - key: str = "lm_head" if tie_word_embeddings else "embedding" - config: dict[str, Any] = GGUF_INNER_CONFIG[GGUF_CONFIG[target_format][key]] - self._apply_config_to_layer(name, config, True) - - if not tie_word_embeddings: - lm_head_name: str = get_lm_head_name(self.model) - config: dict[str, Any] = GGUF_CONFIG[GGUF_CONFIG[target_format]["lm_head"]] - check_fixed_by_user = ( - self.layer_config[lm_head_name].get("fixed_by_user", False) - if lm_head_name in self.layer_config - else None - ) - self._apply_config_to_layer(lm_head_name, config, check_fixed_by_user=check_fixed_by_user) - return True - - return False - - def _apply_config_to_layer( - self, - layer_name: str, - config: dict[str, Any], - check_fixed_by_user: bool = False, - ) -> None: - """Applies GGUF quantization configuration to a given layer. - - Args: - layer_name (str): Name of the layer to configure. - config (dict[str, Any]): GGUF layer configuration. - check_fixed_by_user (bool): If True, preserve user-defined settings. 
- """ - act_bits: int = 16 - scale_dtype: Any = self.scale_dtype - keys: list[str] = ["bits", "group_size", "super_bits", "super_group_size", "data_type", "sym"] - - self.layer_config[layer_name] = self.layer_config.get(layer_name, {}) - - for key in keys: - if ( - key in self.layer_config[layer_name] - and check_fixed_by_user - # and self.layer_config[layer_name].get("fixed_by_user", False) - ): - continue - self.layer_config[layer_name][key] = config.get(key) - setattr(get_module(self.model, layer_name), key, config.get(key)) - - self.layer_config[layer_name]["act_bits"] = act_bits - self.layer_config[layer_name]["scale_dtype"] = scale_dtype - setattr(get_module(self.model, layer_name), "act_bits", act_bits) - setattr(get_module(self.model, layer_name), "scale_dtype", scale_dtype) def _quantize_layer_via_rtn(self, name: str) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. @@ -1993,14 +1818,21 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. """ - for n, m in self.model.named_modules(): + for n, m in self.model.named_modules(): # TODO check if could removed m.tmp_name = n self._check_compatibility() formats = self.formats if hasattr(self, "formats") else None # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) + # self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) + # TODO check scale_dtype + self.layer_config, self.has_qlayer_outside_block = ( + self._prepare_layer_config(self.model, self.layer_config,self.scheme, self.scale_dtype, + self.supported_types,self.inner_supported_types,self.quant_block_list, + self.fp_layers,self.quant_lm_head, + enable_gguf_official_mixed=True,is_mllm=self.mllm)) + if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: @@ -2011,14 +1843,14 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: break if len(self.formats) == 1 and self.formats[0] == "fake": only_gguf = False - if only_gguf: - self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - self.layer_config, self.formats, self.model, model_type=ModelType.TEXT - ) - if self.mllm: - self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - self.layer_config, self.formats, self.model, model_type=ModelType.MMPROJ - ) + # if only_gguf: + # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( + # self.layer_config, self.formats, self.model, model_type=ModelType.TEXT + # ) + # if self.mllm: + # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( + # self.layer_config, self.formats, self.model, model_type=ModelType.MMPROJ + # ) # Determine if immediate packing is required formats = self.formats if ( @@ -2226,141 +2058,6 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: del layer_input clear_memory(q_layer_input) - def _set_layerwise_config(self, model: torch.nn.Module, layer_config: dict) -> bool: - """ - Sets the layer-wise configuration based on the provided `layer_config`. - By default, only quantize layers in blocks. - - Args: - layer_config (dict): The configuration dictionary for each layer containing various configuration options. 
- - Returns: - bool: Returns True if there are quantized layers outside the blocks (e.g., lm-head), - otherwise returns False. - """ - # Get the names of layers in quantization blocks - supported_types = self.supported_types - layers_in_blocks = get_layer_names_in_block( - model, supported_types, self.quant_block_list, self.inner_supported_types - ) - # Process regex in layer_config - all_supported_layer_names = [] - # List of configuration keys - keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) - - for n, m in model.named_modules(): - # Delete previous configuration to avoid conflicts with prior tuning - for key in keys: - if hasattr(m, key): - delattr(m, key) - - if not isinstance(m, supported_types) and m.__class__.__name__ not in self.inner_supported_types: - continue - all_supported_layer_names.append(n) - - names_in_layer_config = list(layer_config.keys()) - for name in names_in_layer_config: - if name in all_supported_layer_names: - continue - matched_names = [] - for layer_name in all_supported_layer_names: - if re.search(re.compile(name), layer_name) is not None: - matched_names.append(layer_name) - if len(matched_names) > 0: - val = layer_config[name] - layer_config.pop(name) - for match_name in matched_names: - layer_config[match_name] = val - else: - tmp_m = get_module(model, name) - if not isinstance(tmp_m, torch.nn.Embedding): # TODO not good code style - raise ValueError(f"key {name} in layer_config is invalid, please have a double check") - - has_qlayer_outside_block = False # Flag to track if there are quantized layers outside blocks (e.g., lm-head) - - # Iterate through all modules in the model - is_gguf = hasattr(self, "formats") and any("gguf" in format_ for format_ in self.formats) - for n, m in model.named_modules(): - # Skip unsupported types - if type(m) not in supported_types and m.__class__.__name__ not in self.inner_supported_types: - if n in self.layer_config: - if not isinstance(m, torch.nn.Embedding): - logger.warning(f"{n} is not supported, layer_config {n}: {layer_config[n]} will be ignored.") - layer_config.pop(n) - continue - if not is_gguf: - if not check_to_quantized(layer_config[n]): - layer_config.pop(n) - continue - else: - continue - - # If the layer is not in the config and is part of a quantization block, use default configuration - if n not in layer_config.keys() and n in layers_in_blocks: - layer_config[n] = {} - for key in keys: - layer_config[n][key] = getattr(self, key) - - # If the layer is partially configured, fill in missing values - elif n in layer_config.keys(): - if "data_type" in layer_config[n] and "bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["data_type"]) - if tmp_bits is not None and tmp_bits != self.bits: - logger.warning( - f"'data_type' do not match the specified 'bits' setting for {n}." - f" Resetting 'bits' to {tmp_bits}." - ) - layer_config[n]["bits"] = tmp_bits - if "act_data_type" in layer_config[n] and "act_bits" not in layer_config[n]: - tmp_bits = infer_bits_by_data_type(layer_config[n]["act_data_type"]) - if tmp_bits is not None and tmp_bits != self.act_bits: - logger.warning( - f"'act_data_type' do not match the specified 'act_bits' setting for {n}." - f" Resetting 'act_bits' to {tmp_bits}." 
- ) - layer_config[n]["act_bits"] = tmp_bits - - for key in keys: - if key not in layer_config[n].keys(): - layer_config[n][key] = getattr(self, key) - layer_config[n]["fixed_by_user"] = True - - # If the layer is not in the config and not part of a quantization block, - # use default configuration and set specific values - else: - layer_config[n] = {} - for key in keys: - layer_config[n][key] = getattr(self, key) - layer_config[n]["bits"] = 16 - layer_config[n]["act_bits"] = 16 - - if n in layers_in_blocks: - layer_config[n]["in_blocks"] = True - else: - layer_config[n]["in_blocks"] = False - - # If the layer is outside a block and requires quantization, mark it as a quantized layer outside the block - if ( - n not in layers_in_blocks - and check_to_quantized(layer_config[n]) - and not isinstance(m, torch.nn.Embedding) - ): - has_qlayer_outside_block = True - - in_features, out_features = get_layer_features(m) - if in_features <= layer_config[n]["group_size"]: - layer_config[n]["group_size"] = -1 - - # Apply the configuration to the corresponding layer in the model - for key in keys: - setattr(m, key, layer_config[n][key]) - need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() - if need_to_quantize_lm_head: - has_qlayer_outside_block = True - - # Return whether there are quantized layers outside the blocks - return has_qlayer_outside_block - @torch.no_grad() def _get_block_outputs( self, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 97a3cdf02..f6ca0cc98 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -16,7 +16,7 @@ from dataclasses import dataclass, fields from typing import Iterable, Optional, Union -__all__ = ["QuantizationScheme", "is_gguf_scheme", "preset_name_to_scheme", "AutoScheme"] +__all__ = ["QuantizationScheme", "get_gguf_scheme", "preset_name_to_scheme", "AutoScheme"] @dataclass @@ -236,15 +236,15 @@ def is_preset_scheme(name: str) -> bool: PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) -def is_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: +def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return True for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue if val == scheme: - return True - return False + return key + return None @dataclass diff --git a/auto_round/utils.py b/auto_round/utils.py index 575b8e3e8..3d35d303e 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -766,8 +766,9 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): def get_layer_names_in_block( - model, supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), quant_block_list=None, class_names=None -): + model:torch.nn.Module, supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), + quant_block_list:list=None, class_names:tuple=None +) -> list[str]: """Retrieves the names of layers within each block of the model. 
Returns: @@ -778,7 +779,7 @@ def get_layer_names_in_block( class_names = [] for n, m in model.named_modules(): if type(m) in supported_types or (class_names is not None and m.__class__.__name__ in class_names): - m.tmp_name = n + m.backup_name = n layers_in_block = [] if bool(quant_block_list): all_blocks = quant_block_list @@ -788,8 +789,9 @@ def get_layer_names_in_block( for block_name in block_names: block = get_module(model, block_name) for n, m in block.named_modules(): - if hasattr(m, "tmp_name"): - layers_in_block.append(m.tmp_name) + if hasattr(m, "backup_name"): + layers_in_block.append(m.backup_name) + delattr(m, "backup_name") return layers_in_block @@ -1840,9 +1842,9 @@ def _gguf_type_fallback(gguf_type): ##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 -def get_layer_config_by_gguf_format(layer_config, gguf_format, model, model_type=ModelType.TEXT): - # TODO: support for other format later - target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) +def get_layer_config_by_gguf_format(layer_config, target_gguf_format:str, model, model_type=ModelType.TEXT): + # # TODO: support for other format later + # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) import gguf # pylint: disable=E0401 From f027801f450d8036e4a98d09a4b6cc3826a7fb21 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 06:33:01 +0000 Subject: [PATCH 22/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/__main__.py | 1 - auto_round/compressors/base.py | 78 ++++++++++++++++------------------ auto_round/utils.py | 8 ++-- 3 files changed, 42 insertions(+), 45 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 43f55a050..97e3eb6ff 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -478,7 +478,6 @@ def tune(args): # layer_config[item[0]] = {} # layer_config[item[0]]["bits"] = item[1] - autoround: BaseCompressor = AutoRound( model=model_name, scheme=scheme, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6178623f0..c5928da60 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block -from auto_round.schemes import AutoScheme, QuantizationScheme, preset_name_to_scheme, get_gguf_scheme +from auto_round.schemes import AutoScheme, QuantizationScheme, get_gguf_scheme, preset_name_to_scheme from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import _handle_moe_model from auto_round.utils import ( @@ -414,18 +414,18 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _prepare_layer_config( - self, - model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - default_scale_dtype: torch.dtype | str, - supported_types: tuple, - inner_supported_types: tuple, - quant_block_list=None, - fp_layers: str = "", - quant_lm_head: bool = False, - enable_gguf_official_mixed: bool = True, - is_mllm: bool = False, + self, + model: torch.nn.Module, 
+ layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + default_scale_dtype: torch.dtype | str, + supported_types: tuple, + inner_supported_types: tuple, + quant_block_list=None, + fp_layers: str = "", + quant_lm_head: bool = False, + enable_gguf_official_mixed: bool = True, + is_mllm: bool = False, ) -> tuple[dict, bool]: """ Normalize, validate, and expand layer-specific quantization configs. @@ -452,8 +452,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str invalid = set(item) - set(scheme_keys) if invalid: raise ValueError( - f"Invalid keys {invalid} in layer_config for '{layer_name}'. " - f"Allowed keys: {scheme_keys}" + f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" ) config = dict(item) else: @@ -472,10 +471,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 1. fp_layers -> force 16 for name in get_fp_layer_names(self.model, fp_layers): - layer_config[name] = { - "bits": 16, "act_bits": 16, - "data_type": "float", "act_data_type": "float" - } + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} # 2. normalize layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} @@ -528,17 +524,15 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 7. lm_head lm_head_name = get_lm_head_name(model) tied_lm_head = False - if ( - hasattr(model, "config") - and model.config.tie_word_embeddings - and hasattr(model, "_tied_weights_keys") - ): - tied_keys =model._tied_weights_keys + if hasattr(model, "config") and model.config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"): + tied_keys = model._tied_weights_keys if lm_head_name in tied_keys: - tied_lm_head=True + tied_lm_head = True if quant_lm_head and tied_lm_head: - quant_lm_head=False - logger.warning("reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently") + quant_lm_head = False + logger.warning( + "reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently" + ) if lm_head_name not in layer_config and quant_lm_head: layer_config[lm_head_name] = default_dict.copy() @@ -589,8 +583,6 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str dispatch_layer_config(layer_config) return layer_config, has_qlayer_outside_block - - def _parse_layer_config( self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str ) -> None: @@ -651,7 +643,7 @@ def _parse_layer_config( if key not in lm_head_layer_config: lm_head_layer_config[key] = getattr(self, key) - def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: + def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) @@ -1023,7 +1015,6 @@ def remove_duplicates(lst): ) formats[i] = gguf_format_name.lower() - _gguf_args_check(self, formats, model_type=ModelType.TEXT) if self.mllm: _gguf_args_check(self, formats, model_type=ModelType.MMPROJ) @@ -1033,8 +1024,6 @@ def remove_duplicates(lst): self.scheme = preset_name_to_scheme(f) break - - for format_ in formats: if format_ not in SUPPORTED_FORMATS: logger.error(f"Unsupported format {format_}, please choose from 
{SUPPORTED_FORMATS}") @@ -1518,7 +1507,6 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. @@ -1818,7 +1806,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. """ - for n, m in self.model.named_modules(): # TODO check if could removed + for n, m in self.model.named_modules(): # TODO check if could removed m.tmp_name = n self._check_compatibility() formats = self.formats if hasattr(self, "formats") else None @@ -1827,11 +1815,19 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.model = _handle_moe_model(self.model, formats=formats) # self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) # TODO check scale_dtype - self.layer_config, self.has_qlayer_outside_block = ( - self._prepare_layer_config(self.model, self.layer_config,self.scheme, self.scale_dtype, - self.supported_types,self.inner_supported_types,self.quant_block_list, - self.fp_layers,self.quant_lm_head, - enable_gguf_official_mixed=True,is_mllm=self.mllm)) + self.layer_config, self.has_qlayer_outside_block = self._prepare_layer_config( + self.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.fp_layers, + self.quant_lm_head, + enable_gguf_official_mixed=True, + is_mllm=self.mllm, + ) if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") diff --git a/auto_round/utils.py b/auto_round/utils.py index 3d35d303e..cac86e397 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -766,8 +766,10 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): def get_layer_names_in_block( - model:torch.nn.Module, supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), - quant_block_list:list=None, class_names:tuple=None + model: torch.nn.Module, + supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), + quant_block_list: list = None, + class_names: tuple = None, ) -> list[str]: """Retrieves the names of layers within each block of the model. 
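A small usage sketch of the helper whose signature is reformatted above, assuming both functions are importable from auto_round.utils (as in the compressor's own import list) and reusing the model from the test added later in this series:

    import transformers

    from auto_round.utils import get_block_names, get_layer_names_in_block

    model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
    blocks = get_block_names(model)  # nested list of decoder block names
    names = get_layer_names_in_block(model, quant_block_list=blocks)
    print(names[:3])  # fully qualified Linear/Conv1D layer names inside the blocks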
@@ -1842,7 +1844,7 @@ def _gguf_type_fallback(gguf_type): ##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 -def get_layer_config_by_gguf_format(layer_config, target_gguf_format:str, model, model_type=ModelType.TEXT): +def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT): # # TODO: support for other format later # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) From c6b78c6ad7276a2ba207d506e6857ba282b507ea Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 14:56:09 +0800 Subject: [PATCH 23/35] tiny change --- auto_round/compressors/base.py | 41 +++++++++++----------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c5928da60..931f18197 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -413,10 +413,10 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _prepare_layer_config( + def _set_layer_config( self, model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + layer_config: dict[str, str | dict | "QuantizationScheme"], default_scheme: "QuantizationScheme", default_scale_dtype: torch.dtype | str, supported_types: tuple, @@ -523,15 +523,15 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 7. lm_head lm_head_name = get_lm_head_name(model) - tied_lm_head = False - if hasattr(model, "config") and model.config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"): - tied_keys = model._tied_weights_keys - if lm_head_name in tied_keys: - tied_lm_head = True - if quant_lm_head and tied_lm_head: + tie_word_embeddings = False + if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): + tie_word_embeddings = model.config.tie_word_embeddings + + if quant_lm_head and tie_word_embeddings: quant_lm_head = False logger.warning( - "reset `quant_lm_head` to false as quantizing lm_head with tied weights has not been supported currently" + "reset `quant_lm_head` to false as quantizing " + "lm_head with tied weights has not been supported currently" ) if lm_head_name not in layer_config and quant_lm_head: @@ -566,7 +566,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str return layer_config, has_qlayer_outside_block # embed + lm_head defaults for gguf - if lm_head_name not in layer_config and not tied_lm_head: + if lm_head_name not in layer_config and not tie_word_embeddings: cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} layer_config[lm_head_name] = cfg @@ -1813,9 +1813,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. 
self.model = _handle_moe_model(self.model, formats=formats) - # self.has_qlayer_outside_block = self._set_layerwise_config(self.model, self.layer_config) + # TODO check scale_dtype - self.layer_config, self.has_qlayer_outside_block = self._prepare_layer_config( + self.layer_config, self.has_qlayer_outside_block = self._set_layer_config( self.model, self.layer_config, self.scheme, @@ -1832,21 +1832,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: - only_gguf = True - for format_ in self.formats: - if not ("gguf" in format_ or "fake" in format_): - only_gguf = False - break - if len(self.formats) == 1 and self.formats[0] == "fake": - only_gguf = False - # if only_gguf: - # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - # self.layer_config, self.formats, self.model, model_type=ModelType.TEXT - # ) - # if self.mllm: - # self.layer_config, gguf_format_config = get_layer_config_by_gguf_format( - # self.layer_config, self.formats, self.model, model_type=ModelType.MMPROJ - # ) # Determine if immediate packing is required formats = self.formats if ( @@ -1958,7 +1943,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: cost_time = end_time - self.start_time logger.info(f"quantization tuning time {cost_time}") - ## dump a summary + # Dump a summary quantized_layers = [] unquantized_layers = [] for n, m in self.model.named_modules(): From 1b9f24e8fde62ffa7f0e0e8321b69ae3c6d6479e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 14:57:58 +0800 Subject: [PATCH 24/35] tiny fix --- auto_round/schemes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/schemes.py b/auto_round/schemes.py index f6ca0cc98..8dde95430 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -236,7 +236,7 @@ def is_preset_scheme(name: str) -> bool: PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value) -def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> bool: +def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return True for key, val in PRESET_SCHEMES.items(): From 2c0075ae48c98d095ef68515af51f509c94af1be Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 15:11:58 +0800 Subject: [PATCH 25/35] tmp change --- auto_round/__main__.py | 2 +- auto_round/compressors/base.py | 6 ++++-- auto_round/schemes.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 97e3eb6ff..a25fc5421 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -110,7 +110,7 @@ def __init__(self, *args, **kwargs): self.add_argument( "--scale_dtype", - default="fp16", + default=None, choices=["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"], help="scale data type to use for quantization", ) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 931f18197..281efff46 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -202,6 +202,10 @@ def __init__( ... 
} """ self.scheme = self._parse_and_set_scheme(scheme, kwargs) + + gguf_scheme_name = get_gguf_scheme(self.scheme) + # GGUF uses fp32 scale dtype as default + scale_dtype = kwargs.pop("scale_dtype", "fp32") if gguf_scheme_name else kwargs.pop("scale_dtype", "fp16") # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options amp = kwargs.pop("amp", True) @@ -214,7 +218,6 @@ def __init__( sampler = kwargs.pop("sampler", "rand") not_use_best_mse = kwargs.pop("not_use_best_mse", False) dynamic_max_gap = kwargs.pop("dynamic_max_gap", -1) - scale_dtype = kwargs.pop("scale_dtype", "fp16") nblocks = kwargs.pop("nblocks", 1) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) @@ -287,7 +290,6 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - # self._parse_layer_config(layer_config, fp_layers) # Must place after model init self.to_quant_block_names = to_quant_block_names diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 8dde95430..c5513d79a 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -244,7 +244,7 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: continue if val == scheme: return key - return None + return "" @dataclass From 97198f07b6fe660e6fba8b63c224ddb7440441d1 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 15:40:49 +0800 Subject: [PATCH 26/35] tmp change --- auto_round/auto_schemes/utils.py | 5 ++++ auto_round/compressors/base.py | 39 +++++++++--------------------- auto_round/schemes.py | 4 +-- test/test_cuda/test_auto_scheme.py | 33 +++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 29 deletions(-) create mode 100644 auto_round/auto_schemes/utils.py create mode 100644 test/test_cuda/test_auto_scheme.py diff --git a/auto_round/auto_schemes/utils.py b/auto_round/auto_schemes/utils.py new file mode 100644 index 000000000..fdcd343e7 --- /dev/null +++ b/auto_round/auto_schemes/utils.py @@ -0,0 +1,5 @@ +def get_total_bits(model, layer_config): + pass + +def get_bits(layer): + pass diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 281efff46..8477b2ed7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -201,7 +201,10 @@ def __init__( ... # ... ... 
} """ - self.scheme = self._parse_and_set_scheme(scheme, kwargs) + if isinstance(scheme, AutoScheme): #TODO AutoScheme could also be patched by group_size, etc + self.scheme = self._parse_and_set_scheme(scheme.options[0], kwargs) + else: + self.scheme = self._parse_and_set_scheme(scheme, kwargs) gguf_scheme_name = get_gguf_scheme(self.scheme) # GGUF uses fp32 scale dtype as default @@ -271,6 +274,12 @@ def __init__( self.tokenizer = tokenizer self.shared_cache_keys = get_shared_keys(self.model) + self.to_quant_block_names = to_quant_block_names + if not hasattr(self, "quant_block_list"): + all_blocks = get_block_names(model) + self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names) + + if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead") @@ -290,9 +299,6 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - - self.to_quant_block_names = to_quant_block_names - # Set device, must place after model loading self._set_device(device_map) @@ -342,27 +348,6 @@ def __init__( if self.static_kv_dtype is not None: logger.warning("The static kv is experimental and currently has limited support.") - # Model related - self.quantized = False - if isinstance(model, str): - model, tokenizer, low_cpu_mem_usage = llm_load_model( - model, device=device, low_cpu_mem_mode=low_cpu_mem_usage - ) - elif tokenizer is None and iters > 0: - raise ValueError("A tokenizer must be set for non-str model input") - self.low_cpu_mem_usage = bool(low_cpu_mem_usage) - if unsupported_meta_device(model): - raise RuntimeError( - "AutoRound does not support parameters on meta device. " - "Please use more GPUs by setting `--device_map 0,1,2,3` or just place the model on CPU." - ) - self.model = model.eval() - self.tokenizer = tokenizer - self.shared_cache_keys = get_shared_keys(self.model) - if not hasattr(self, "quant_block_list"): - all_blocks = get_block_names(model) - self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names) - self.scale_dtype = convert_dtype_str2torch(scale_dtype) self._set_amp_dtype() self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device @@ -418,7 +403,7 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: def _set_layer_config( self, model: torch.nn.Module, - layer_config: dict[str, str | dict | "QuantizationScheme"], + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], default_scheme: "QuantizationScheme", default_scale_dtype: torch.dtype | str, supported_types: tuple, @@ -558,7 +543,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str for cfg in layer_config.values(): if "in_blocks" not in cfg: cfg["in_blocks"] = False - # 如果 layer 不在 blocks 且需要量化,则标记存在 blocks 外的量化层 + # mark layer outside block if not cfg["in_blocks"] and check_to_quantized(cfg): has_qlayer_outside_block = True diff --git a/auto_round/schemes.py b/auto_round/schemes.py index c5513d79a..ee12607eb 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -249,7 +249,7 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: @dataclass class AutoScheme: - options: Optional[Iterable[QuantizationScheme]] + options: Optional[Iterable[QuantizationScheme|str]] target_bits: float shared_layers: Optional[Iterable[Iterable[str]]] = None - method: str = "naive_pre" + method: str = "default" diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py new 
file mode 100644 index 000000000..4fd2e9c8b --- /dev/null +++ b/test/test_cuda/test_auto_scheme.py @@ -0,0 +1,33 @@ +import copy +import re +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") +import torch +import transformers +from lm_eval.utils import make_table # pylint: disable=E0401 +from transformers import AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound, AutoRoundConfig,AutoScheme +from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model +from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 + +class TestAutoScheme(unittest.TestCase): + @classmethod + def setUpClass(self): + self.save_dir = "./saved" + self.tasks = "lambada_openai" + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + + def test_auto_scheme(self): + model_name = "facebook/opt-125m" + scheme = AutoScheme(target_bits=3, options=("W2A16","W4A16","BF16")) + ar = AutoRound(model_name=model_name,scheme=scheme) + ar.quantize_and_save(self.save_dir) From 27b4b4da882966b06ee750fe93e5fe4db3694bde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 07:43:18 +0000 Subject: [PATCH 27/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/auto_schemes/utils.py | 16 ++++++++++++++++ auto_round/compressors/base.py | 3 +-- auto_round/schemes.py | 2 +- test/test_cuda/test_auto_scheme.py | 8 ++++---- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/auto_round/auto_schemes/utils.py b/auto_round/auto_schemes/utils.py index fdcd343e7..e01da9913 100644 --- a/auto_round/auto_schemes/utils.py +++ b/auto_round/auto_schemes/utils.py @@ -1,5 +1,21 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + def get_total_bits(model, layer_config): pass + def get_bits(layer): pass diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8477b2ed7..f7cd773d8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -201,7 +201,7 @@ def __init__( ... # ... ... 
} """ - if isinstance(scheme, AutoScheme): #TODO AutoScheme could also be patched by group_size, etc + if isinstance(scheme, AutoScheme): # TODO AutoScheme could also be patched by group_size, etc self.scheme = self._parse_and_set_scheme(scheme.options[0], kwargs) else: self.scheme = self._parse_and_set_scheme(scheme, kwargs) @@ -279,7 +279,6 @@ def __init__( all_blocks = get_block_names(model) self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names) - if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead") diff --git a/auto_round/schemes.py b/auto_round/schemes.py index ee12607eb..38bed87e1 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -249,7 +249,7 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: @dataclass class AutoScheme: - options: Optional[Iterable[QuantizationScheme|str]] + options: Optional[Iterable[QuantizationScheme | str]] target_bits: float shared_layers: Optional[Iterable[Iterable[str]]] = None method: str = "default" diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 4fd2e9c8b..6376e92c2 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -10,10 +10,11 @@ from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig,AutoScheme +from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 + class TestAutoScheme(unittest.TestCase): @classmethod def setUpClass(self): @@ -25,9 +26,8 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_auto_scheme(self): model_name = "facebook/opt-125m" - scheme = AutoScheme(target_bits=3, options=("W2A16","W4A16","BF16")) - ar = AutoRound(model_name=model_name,scheme=scheme) + scheme = AutoScheme(target_bits=3, options=("W2A16", "W4A16", "BF16")) + ar = AutoRound(model_name=model_name, scheme=scheme) ar.quantize_and_save(self.save_dir) From 2d3095a05368e3c082861df49d0cff4c0b7855c2 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 16:36:52 +0800 Subject: [PATCH 28/35] update --- auto_round/auto_schemes/gen_scheme.py | 19 ++ auto_round/compressors/base.py | 261 +++----------------------- auto_round/schemes.py | 2 +- auto_round/utils.py | 177 ++++++++++++++++- test/test_cuda/test_auto_scheme.py | 11 +- 5 files changed, 224 insertions(+), 246 deletions(-) create mode 100644 auto_round/auto_schemes/gen_scheme.py diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py new file mode 100644 index 000000000..badf39742 --- /dev/null +++ b/auto_round/auto_schemes/gen_scheme.py @@ -0,0 +1,19 @@ +from typing import Union, Iterable + +import torch + +from auto_round import AutoScheme + + +class GenScheme: + def __init__(self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme:dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k" + ): + pass + + diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f7cd773d8..495cf3d04 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -93,7 +93,7 @@ set_module, to_device, to_dtype, - 
unsupported_meta_device, + unsupported_meta_device, set_layer_config, ) from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block @@ -236,6 +236,9 @@ def __init__( self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) self.fp_layers = kwargs.pop("fp_layers", "") self.layer_config = layer_config + self.supported_types = SUPPORTED_LAYER_TYPES + self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES + self.scale_dtype = convert_dtype_str2torch(scale_dtype) if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") @@ -285,6 +288,28 @@ def __init__( if device_map is None: device_map = 0 + if isinstance(scheme, AutoScheme): + if self.mllm: + logger.info("AutoScheme with MLLM is not supported yet.") + sys.exit(1) + layer_config,_ = set_layer_config(self.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.fp_layers, + self.quant_lm_head, + enable_gguf_official_mixed=False, + is_mllm=self.mllm) + quant_layer_names = layer_config.keys() + fixed_layer_scheme = {k: v for k, v in layer_config.items() if v.get("fixed_by_user", False)} + # mainly using quant_layers and fixed by users + from auto_round.auto_schemes.gen_scheme import GenScheme + gen_scheme = GenScheme(scheme,self.model,quant_layer_names,fixed_layer_scheme, self.scale_dtype, self.dataset) + + # Set device, must place after model loading self._set_device(device_map) @@ -347,7 +372,6 @@ def __init__( if self.static_kv_dtype is not None: logger.warning("The static kv is experimental and currently has limited support.") - self.scale_dtype = convert_dtype_str2torch(scale_dtype) self._set_amp_dtype() self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device if self.act_bits <= 8 and self.amp_dtype == torch.float16: @@ -359,8 +383,6 @@ def __init__( logger.info(f"using {self.model.dtype} for quantization tuning") # Some helpers - self.supported_types = SUPPORTED_LAYER_TYPES - self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES if "hpu" in str(self.device): self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") self.batch_dim = None @@ -399,235 +421,6 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _set_layer_config( - self, - model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", - default_scale_dtype: torch.dtype | str, - supported_types: tuple, - inner_supported_types: tuple, - quant_block_list=None, - fp_layers: str = "", - quant_lm_head: bool = False, - enable_gguf_official_mixed: bool = True, - is_mllm: bool = False, - ) -> tuple[dict, bool]: - """ - Normalize, validate, and expand layer-specific quantization configs. 
- Returns (final_layer_config, has_quant_layer_outside_block) - """ - - from auto_round.schemes import get_gguf_scheme - - # ---- helpers ------------------------------------------------- - def dispatch_layer_config(layer_config: dict[str, dict]) -> None: - """Assign scheme values as attributes to matched modules.""" - for layer_name, scheme in layer_config.items(): - module = get_module(model, layer_name) - for attr, value in scheme.items(): - setattr(module, attr, value) - - def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: - """Convert config entry into dict and validate keys.""" - if isinstance(item, str): - config = asdict(preset_name_to_scheme(item.upper())) - elif isinstance(item, QuantizationScheme): - config = asdict(item) - elif isinstance(item, dict): - invalid = set(item) - set(scheme_keys) - if invalid: - raise ValueError( - f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" - ) - config = dict(item) - else: - raise TypeError( - f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " - f"Expected str, dict, or QuantizationScheme." - ) - # Clean up - config = {k: v for k, v in config.items() if v is not None} - config["fixed_by_user"] = True - return config - - # ---- main logic ---------------------------------------------- - scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) - layer_config = copy.deepcopy(layer_config) or {} - - # 1. fp_layers -> force 16 - for name in get_fp_layer_names(self.model, fp_layers): - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - - # 2. normalize - layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} - - # 3. infer missing bits - for cfg in layer_config.values(): - if "data_type" in cfg and "bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: - cfg["bits"] = b - if "act_data_type" in cfg and "act_bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: - cfg["act_bits"] = b - - # 4. fill defaults - default_dict = asdict(default_scheme) - default_dict["scale_dtype"] = default_scale_dtype - for cfg in layer_config.values(): - for key in scheme_keys: - cfg.setdefault(key, default_dict.get(key)) - - # 5. collect supported modules - gguf_name = get_gguf_scheme(default_scheme) - if gguf_name and torch.nn.Embedding not in supported_types: - supported_types = (*supported_types, torch.nn.Embedding) - - all_layer_names, embedding_layer_names = [], [] - for n, m in model.named_modules(): - # cleanup stale attributes - for key in scheme_keys: - if hasattr(m, key): - delattr(m, key) - if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: - continue - all_layer_names.append(n) - if isinstance(m, torch.nn.Embedding): - embedding_layer_names.append(n) - - # 6. expand regex configs - for name in list(layer_config.keys()): - if name in all_layer_names: - continue - regex = re.compile(name) - matched = [ln for ln in all_layer_names if regex.search(ln)] - if not matched: - raise ValueError(f"Invalid regex '{name}' in layer_config, no match found.") - val = layer_config.pop(name) - for match in matched: - layer_config[match] = val - - # 7. 
lm_head - lm_head_name = get_lm_head_name(model) - tie_word_embeddings = False - if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): - tie_word_embeddings = model.config.tie_word_embeddings - - if quant_lm_head and tie_word_embeddings: - quant_lm_head = False - logger.warning( - "reset `quant_lm_head` to false as quantizing " - "lm_head with tied weights has not been supported currently" - ) - - if lm_head_name not in layer_config and quant_lm_head: - layer_config[lm_head_name] = default_dict.copy() - - # 8. enforce shape divisibility for int weight-only - if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: - for n, m in model.named_modules(): - if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: - if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: - layer_config.setdefault(n, default_dict.copy()) - layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) - logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") - - # 9. block layers: mark as in_blocks=True - for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): - cfg = layer_config.setdefault(name, default_dict.copy()) - cfg["in_blocks"] = True - - # ---- restore: ensure missing in_blocks are set to False and compute flag ---- - has_qlayer_outside_block = False - for cfg in layer_config.values(): - if "in_blocks" not in cfg: - cfg["in_blocks"] = False - # mark layer outside block - if not cfg["in_blocks"] and check_to_quantized(cfg): - has_qlayer_outside_block = True - - # 10. GGUF handling - if not gguf_name: - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block - - # embed + lm_head defaults for gguf - if lm_head_name not in layer_config and not tie_word_embeddings: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[lm_head_name] = cfg - has_qlayer_outside_block = True - for emd_name in embedding_layer_names: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[emd_name] = cfg - - if enable_gguf_official_mixed: - model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT - layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) - - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block - - def _parse_layer_config( - self, layer_config: dict[str, Union[str, dict, QuantizationScheme]], fp_layers: str - ) -> None: - """Parse and set the layer-wise quantization configuration.""" - not_quantize_layer_names = get_fp_layer_names(self.model, fp_layers) - if len(not_quantize_layer_names) > 0: - logger.info(f"{not_quantize_layer_names} will not be quantized.") - if layer_config is None: - layer_config = {} - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} - - # Some other quantization configs - self.layer_config = copy.deepcopy(layer_config) if layer_config is not None else {} - scheme_keys = {f.name for f in fields(QuantizationScheme)} - - for key, item in self.layer_config.items(): - if isinstance(item, str): - config = asdict(preset_name_to_scheme(item.upper())) - elif isinstance(item, QuantizationScheme): - config = asdict(item) - elif 
isinstance(item, dict): - invalid_keys = set(item) - scheme_keys - if invalid_keys: - raise ValueError( - f"Invalid keys {invalid_keys} in layer_config for layer '{key}', " - f"only {scheme_keys} are supported" - ) - config = dict(item) - - # Drop None values - config = {k: v for k, v in config.items() if v is not None} - self.layer_config[key] = config - - if not self.quant_lm_head or (isinstance(self.scheme, str) and self.scheme.lower().startswith("gguf")): - return - for n, _ in self.model.named_modules(): - lm_head_layer_name = n - - if ( - hasattr(self.model, "config") - and self.model.config.tie_word_embeddings - and hasattr(self.model, "_tied_weights_keys") - ): - tied_keys = self.model._tied_weights_keys - for item in tied_keys: - if lm_head_layer_name in item: # TODO extend to encoder-decoder layer, seq classification model - self.quant_lm_head = False - logger.warning( - "reset `quant_lm_head` to `False` as quantizing lm_head with tied weights has not been " - "supported currently" - ) - break - - lm_head_layer_config = self.layer_config[lm_head_layer_name] if lm_head_layer_name in self.layer_config else {} - - for key in scheme_keys: - if key not in lm_head_layer_config: - lm_head_layer_config[key] = getattr(self, key) def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" @@ -1801,7 +1594,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.model = _handle_moe_model(self.model, formats=formats) # TODO check scale_dtype - self.layer_config, self.has_qlayer_outside_block = self._set_layer_config( + self.layer_config, self.has_qlayer_outside_block = set_layer_config( self.model, self.layer_config, self.scheme, diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 38bed87e1..cf7d4d433 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -250,6 +250,6 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: @dataclass class AutoScheme: options: Optional[Iterable[QuantizationScheme | str]] - target_bits: float + avg_bits: float shared_layers: Optional[Iterable[Iterable[str]]] = None method: str = "default" diff --git a/auto_round/utils.py b/auto_round/utils.py index cac86e397..92e54a3ba 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -21,7 +21,7 @@ import re import sys from collections import UserDict -from dataclasses import fields +from dataclasses import fields, asdict from enum import Enum from functools import lru_cache from pathlib import Path @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme +from auto_round.schemes import QuantizationScheme, preset_name_to_scheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -2742,3 +2742,176 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return True return False + + + +def set_layer_config( + model: torch.nn.Module, + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: "QuantizationScheme", + default_scale_dtype: torch.dtype | str, + supported_types: tuple, + inner_supported_types: tuple, + quant_block_list=None, + fp_layers: str = "", + quant_lm_head: bool = False, + enable_gguf_official_mixed: bool = True, + is_mllm: bool = False, +) -> tuple[dict, bool]: + """ + Normalize, validate, and expand layer-specific 
quantization configs. + Returns (final_layer_config, has_quant_layer_outside_block) + """ + + from auto_round.schemes import get_gguf_scheme + + # ---- helpers ------------------------------------------------- + def dispatch_layer_config(layer_config: dict[str, dict]) -> None: + """Assign scheme values as attributes to matched modules.""" + for layer_name, scheme in layer_config.items(): + module = get_module(model, layer_name) + for attr, value in scheme.items(): + setattr(module, attr, value) + + def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: + """Convert config entry into dict and validate keys.""" + if isinstance(item, str): + config = asdict(preset_name_to_scheme(item.upper())) + elif isinstance(item, QuantizationScheme): + config = asdict(item) + elif isinstance(item, dict): + invalid = set(item) - set(scheme_keys) + if invalid: + raise ValueError( + f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" + ) + config = dict(item) + else: + raise TypeError( + f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " + f"Expected str, dict, or QuantizationScheme." + ) + # Clean up + config = {k: v for k, v in config.items() if v is not None} + config["fixed_by_user"] = True + return config + + # ---- main logic ---------------------------------------------- + scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) + layer_config = copy.deepcopy(layer_config) or {} + + # 1. fp_layers -> force 16 + for name in get_fp_layer_names(model, fp_layers): + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float","fixed_by_user":True} + + # 2. normalize + layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} + + # 3. infer missing bits + for cfg in layer_config.values(): + if "data_type" in cfg and "bits" not in cfg: + if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: + cfg["bits"] = b + if "act_data_type" in cfg and "act_bits" not in cfg: + if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: + cfg["act_bits"] = b + + # 4. fill defaults + default_dict = asdict(default_scheme) + default_dict["scale_dtype"] = default_scale_dtype + for cfg in layer_config.values(): + for key in scheme_keys: + cfg.setdefault(key, default_dict.get(key)) + + # 5. collect supported modules + gguf_name = get_gguf_scheme(default_scheme) + if gguf_name and torch.nn.Embedding not in supported_types: + supported_types = (*supported_types, torch.nn.Embedding) + + all_layer_names, embedding_layer_names = [], [] + for n, m in model.named_modules(): + # cleanup stale attributes + for key in scheme_keys: + if hasattr(m, key): + delattr(m, key) + if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: + continue + all_layer_names.append(n) + if isinstance(m, torch.nn.Embedding): + embedding_layer_names.append(n) + + # 6. expand regex configs + for name in list(layer_config.keys()): + if name in all_layer_names: + continue + regex = re.compile(name) + matched = [ln for ln in all_layer_names if regex.search(ln)] + if not matched: + raise ValueError(f"Invalid '{name}' in layer_config, no match found.") + val = layer_config.pop(name) + for match in matched: + layer_config[match] = val + + # 7. 
lm_head + lm_head_name = get_lm_head_name(model) + tie_word_embeddings = False + if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): + tie_word_embeddings = model.config.tie_word_embeddings + + if quant_lm_head and tie_word_embeddings: + quant_lm_head = False + logger.warning( + "reset `quant_lm_head` to false as quantizing " + "lm_head with tied weights has not been supported currently" + ) + + if lm_head_name not in layer_config and quant_lm_head: + layer_config[lm_head_name] = default_dict.copy() + + # 8. enforce shape divisibility for int weight-only + if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: + layer_config.setdefault(n, default_dict.copy()) + layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) + logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") + + # 9. block layers: mark as in_blocks=True + for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): + if name not in layer_config: + layer_config[name] = default_dict.copy() + layer_config[name]["fixed_by_user"]=False + layer_config[name]["in_blocks"] = True + + # ---- restore: ensure missing in_blocks are set to False and compute flag ---- + has_qlayer_outside_block = False + for cfg in layer_config.values(): + if "in_blocks" not in cfg: + cfg["in_blocks"] = False + # mark layer outside block + if not cfg["in_blocks"] and check_to_quantized(cfg): + has_qlayer_outside_block = True + + # 10. GGUF handling + if not gguf_name: + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block + + # embed + lm_head defaults for gguf + if lm_head_name not in layer_config and not tie_word_embeddings: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[lm_head_name] = cfg + has_qlayer_outside_block = True + for emd_name in embedding_layer_names: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[emd_name] = cfg + + if enable_gguf_official_mixed: + model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT + layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) + + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 6376e92c2..b4f5e6041 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -5,15 +5,8 @@ import unittest sys.path.insert(0, "../..") -import torch -import transformers -from lm_eval.utils import make_table # pylint: disable=E0401 -from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig, AutoScheme -from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 - class TestAutoScheme(unittest.TestCase): @classmethod @@ -28,6 +21,6 @@ def tearDownClass(self): def test_auto_scheme(self): model_name = "facebook/opt-125m" - scheme = AutoScheme(target_bits=3, options=("W2A16", 
"W4A16", "BF16")) - ar = AutoRound(model_name=model_name, scheme=scheme) + scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) + ar = AutoRound(model=model_name, scheme=scheme, iters=1, nsamples=1) ar.quantize_and_save(self.save_dir) From 35a298b0f30c57df5e5af1808d3330538371c237 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 08:37:38 +0000 Subject: [PATCH 29/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/auto_schemes/gen_scheme.py | 35 ++++++++++++++++++--------- auto_round/compressors/base.py | 32 +++++++++++++----------- auto_round/utils.py | 16 +++++++----- test/test_cuda/test_auto_scheme.py | 1 + 4 files changed, 53 insertions(+), 31 deletions(-) diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py index badf39742..ba6b0a679 100644 --- a/auto_round/auto_schemes/gen_scheme.py +++ b/auto_round/auto_schemes/gen_scheme.py @@ -1,4 +1,18 @@ -from typing import Union, Iterable +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterable, Union import torch @@ -6,14 +20,13 @@ class GenScheme: - def __init__(self, - auto_scheme: AutoScheme, - model: torch.nn.Module, - quant_layer_names: Iterable[str], - fixed_layer_scheme:dict[str, dict], - scale_dtype: str = "fp16", - dataset="pile-10k" - ): + def __init__( + self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme: dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k", + ): pass - - diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 495cf3d04..66c79274f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -90,10 +90,11 @@ mv_module_from_gpu, reset_params, set_amax_for_all_moe_layers, + set_layer_config, set_module, to_device, to_dtype, - unsupported_meta_device, set_layer_config, + unsupported_meta_device, ) from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block @@ -292,23 +293,27 @@ def __init__( if self.mllm: logger.info("AutoScheme with MLLM is not supported yet.") sys.exit(1) - layer_config,_ = set_layer_config(self.model, - self.layer_config, - self.scheme, - self.scale_dtype, - self.supported_types, - self.inner_supported_types, - self.quant_block_list, - self.fp_layers, - self.quant_lm_head, - enable_gguf_official_mixed=False, - is_mllm=self.mllm) + layer_config, _ = set_layer_config( + self.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.fp_layers, + self.quant_lm_head, + enable_gguf_official_mixed=False, + is_mllm=self.mllm, + ) quant_layer_names = layer_config.keys() fixed_layer_scheme = {k: v for k, v in layer_config.items() if v.get("fixed_by_user", False)} # mainly using 
quant_layers and fixed by users from auto_round.auto_schemes.gen_scheme import GenScheme - gen_scheme = GenScheme(scheme,self.model,quant_layer_names,fixed_layer_scheme, self.scale_dtype, self.dataset) + gen_scheme = GenScheme( + scheme, self.model, quant_layer_names, fixed_layer_scheme, self.scale_dtype, self.dataset + ) # Set device, must place after model loading self._set_device(device_map) @@ -421,7 +426,6 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: else: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") - def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" if isinstance(scheme, QuantizationScheme): diff --git a/auto_round/utils.py b/auto_round/utils.py index 92e54a3ba..e865726b8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -21,7 +21,7 @@ import re import sys from collections import UserDict -from dataclasses import fields, asdict +from dataclasses import asdict, fields from enum import Enum from functools import lru_cache from pathlib import Path @@ -2744,7 +2744,6 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return False - def set_layer_config( model: torch.nn.Module, layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], @@ -2802,7 +2801,13 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 1. fp_layers -> force 16 for name in get_fp_layer_names(model, fp_layers): - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float","fixed_by_user":True} + layer_config[name] = { + "bits": 16, + "act_bits": 16, + "data_type": "float", + "act_data_type": "float", + "fixed_by_user": True, + } # 2. 
normalize layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} @@ -2861,8 +2866,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if quant_lm_head and tie_word_embeddings: quant_lm_head = False logger.warning( - "reset `quant_lm_head` to false as quantizing " - "lm_head with tied weights has not been supported currently" + "reset `quant_lm_head` to false as quantizing " "lm_head with tied weights has not been supported currently" ) if lm_head_name not in layer_config and quant_lm_head: @@ -2881,7 +2885,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): if name not in layer_config: layer_config[name] = default_dict.copy() - layer_config[name]["fixed_by_user"]=False + layer_config[name]["fixed_by_user"] = False layer_config[name]["in_blocks"] = True # ---- restore: ensure missing in_blocks are set to False and compute flag ---- diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b4f5e6041..b9fffdee9 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -8,6 +8,7 @@ from auto_round import AutoRound, AutoRoundConfig, AutoScheme + class TestAutoScheme(unittest.TestCase): @classmethod def setUpClass(self): From 4a594cd5e778b2c5897e3131d72e21c7e46ba74f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 29 Sep 2025 20:37:42 +0800 Subject: [PATCH 30/35] fix --- auto_round/compressors/base.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 66c79274f..955971306 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -725,20 +725,20 @@ def _check_compatibility(self) -> None: " We are likely to release new algorithm for certain configurations in the future." 
) - # Check group_size 32 for auto_round - if ( - self.data_type == "int" - and hasattr(self, "formats") - and any(key in fmt for fmt in self.formats for key in ("auto_round", "auto_gptq", "auto_awq")) - ): - for n, m in self.model.named_modules(): - if type(m) in self.supported_types: - if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - self.layer_config[n] = {"bits": 16} - logger.info( - f"{n} will not be quantized due to its shape not being divisible by 32," - " resulting in an exporting issue to autogptq" - ) + # # Check group_size 32 for auto_round + # if ( + # self.data_type == "int" + # and hasattr(self, "formats") + # and any(key in fmt for fmt in self.formats for key in ("auto_round", "auto_gptq", "auto_awq")) + # ): + # for n, m in self.model.named_modules(): + # if type(m) in self.supported_types: + # if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + # self.layer_config[n] = {"bits": 16} + # logger.info( + # f"{n} will not be quantized due to its shape not being divisible by 32," + # " resulting in an exporting issue to autogptq" + # ) if ( self.seqlen is not None From dcd08d629cc3840efdc24b8c9af97af2edf71095 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 30 Sep 2025 14:02:23 +0800 Subject: [PATCH 31/35] fix uts, still one left --- .../export/export_to_autoround/export_to_nvfp_mxfp.py | 4 ++-- auto_round/schemes.py | 9 +++++++-- auto_round/utils.py | 5 +++++ test/test_cpu/test_autoround.py | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index c4a02f673..240a94899 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -174,7 +174,7 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs): for n, m in model.named_modules(): if type(m) in SUPPORTED_LAYER_TYPES: layer = m - if layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): + if hasattr(layer,"act_bits") and layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): assert hasattr(layer, "act_max") from auto_round.data_type.nvfp import calculate_gparam @@ -198,7 +198,7 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs): for layer_name in layer_config: if ( not layer_config[layer_name]["in_blocks"] and layer_config[layer_name]["bits"] <= 8 - ): ##lm head ##TODO fix act and so on + ): ##lm head # TODO fix act and so on extra_config[layer_name] = {} extra_config[layer_name]["bits"] = layer_config[layer_name]["bits"] extra_config[layer_name]["data_type"] = layer_config[layer_name]["data_type"] diff --git a/auto_round/schemes.py b/auto_round/schemes.py index cf7d4d433..32be2fb52 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -238,11 +238,16 @@ def is_preset_scheme(name: str) -> bool: def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): - return True + return scheme for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue - if val == scheme: + equal = True + for scheme_key in val.keys(): + if val[scheme_key] is not None and val[scheme_key] != scheme.get(scheme_key, None): + equal = False + break + if equal: return key return "" diff --git a/auto_round/utils.py b/auto_round/utils.py index e865726b8..009d516d8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2834,7 +2834,9 @@ 
def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str supported_types = (*supported_types, torch.nn.Embedding) all_layer_names, embedding_layer_names = [], [] + all_module_names = [] for n, m in model.named_modules(): + all_module_names.append(n) # cleanup stale attributes for key in scheme_keys: if hasattr(m, key): @@ -2849,6 +2851,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str for name in list(layer_config.keys()): if name in all_layer_names: continue + if name in all_module_names: + logger.warning_once(f"the type of `{name}` is not supported in your scheme, ignore it for now.") + continue regex = re.compile(name) matched = [ln for ln in all_layer_names if regex.search(ln)] if not matched: diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 9511f0cf8..aac524800 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -720,6 +720,7 @@ def test_invalid_layer_config(self): iters=1, layer_config=layer_config, ) + ar.quantize() def test_quant_lm_head(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" From 91722646068c4adcdbfefe058e3f486b58793a6f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 30 Sep 2025 15:19:25 +0800 Subject: [PATCH 32/35] fix gguf issue --- auto_round/compressors/base.py | 6 ++++-- auto_round/schemes.py | 2 ++ auto_round/utils.py | 37 ++++++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 955971306..0731d0ba8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -428,11 +428,13 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" + res= "" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) elif isinstance(scheme, dict): scheme = scheme elif isinstance(scheme, str): + res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different scheme = scheme.upper() scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] @@ -481,7 +483,7 @@ def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kw break for key in scheme_keys: scheme[key] = getattr(self, key) - return QuantizationScheme.from_dict(scheme) + return res if res else QuantizationScheme.from_dict(scheme) def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" @@ -804,7 +806,7 @@ def remove_duplicates(lst): for f in formats: if f.startswith("gguf"): - self.scheme = preset_name_to_scheme(f) + self.scheme = f.upper() break for format_ in formats: diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 32be2fb52..cde37a0c9 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -239,6 +239,8 @@ def is_preset_scheme(name: str) -> bool: def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: if isinstance(scheme, str) and scheme.upper().startswith("GGUF"): return scheme + if isinstance(scheme, str): + return "" for key, val in PRESET_SCHEMES.items(): if not key.upper().startswith("GGUF"): continue diff --git a/auto_round/utils.py b/auto_round/utils.py index 009d516d8..84390fa43 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ 
-35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme +from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, get_gguf_scheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -1940,6 +1940,30 @@ def _set_config(config, target_config): ) new_type = new_type[:bits_index] + target_bits + new_type[bits_index + 1 :] else: + config_tmp = config.copy() + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in config.keys(): + if key not in scheme_keys: + config_tmp.pop(key, None) + matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched + if not matched_scheme: + if config.get("super_group_size", None) is not None: + new_type = new_type[:bits_index] + str(config["bits"]) + "_k" + if config.get("super_group_size", None) is None or new_type not in GGUF_INNER_CONFIG: + if config.get("sym", True): + new_type = new_type[:bits_index] + str(config["bits"]) + "_0" + if new_type not in GGUF_INNER_CONFIG: + new_type = new_type[:bits_index] + str(config["bits"]) + "_1" + if not config.get("sym", True): + new_type = new_type[:bits_index] + str(config["bits"]) + "_1" + if new_type not in GGUF_INNER_CONFIG: + new_type = new_type[:bits_index] + str(config["bits"]) + "_0" + if new_type not in GGUF_INNER_CONFIG: + raise ValueError(f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, please have a check.") + else: + logger.warning_once(f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, reset to {new_type}") new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :] new_type = _search_gguf_type(new_type) if new_type is None: @@ -2747,7 +2771,7 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): def set_layer_config( model: torch.nn.Module, layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: "QuantizationScheme", + default_scheme: Union[str, "QuantizationScheme"], default_scale_dtype: torch.dtype | str, supported_types: tuple, inner_supported_types: tuple, @@ -2822,11 +2846,14 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str cfg["act_bits"] = b # 4. fill defaults - default_dict = asdict(default_scheme) + if isinstance(default_scheme,str): + default_dict = asdict(preset_name_to_scheme(default_scheme.upper())) + else: + default_dict = asdict(default_scheme) default_dict["scale_dtype"] = default_scale_dtype for cfg in layer_config.values(): for key in scheme_keys: - cfg.setdefault(key, default_dict.get(key)) + cfg.setdefault(key, default_dict.copy().get(key)) # 5. 
collect supported modules gguf_name = get_gguf_scheme(default_scheme) @@ -2914,6 +2941,8 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str layer_config[lm_head_name] = cfg has_qlayer_outside_block = True for emd_name in embedding_layer_names: + if emd_name in layer_config: + continue cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} layer_config[emd_name] = cfg From f98092c6e6b4e53e9d4653ec21fe4e7fa69a0ee3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 07:25:15 +0000 Subject: [PATCH 33/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 5 ++--- .../export_to_autoround/export_to_nvfp_mxfp.py | 2 +- auto_round/utils.py | 18 +++++++++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f957b2a5c..f1036ecac 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -328,7 +328,6 @@ def __init__( self.device_map = None self._set_device_map_in_blocks(self.device_map) - # Tuning hyperparameters self.seed = seed set_seed(self.seed) @@ -416,13 +415,13 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" - res= "" + res = "" if isinstance(scheme, QuantizationScheme): scheme = asdict(scheme) elif isinstance(scheme, dict): scheme = scheme elif isinstance(scheme, str): - res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different + res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different scheme = scheme.upper() scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index 240a94899..eaf3ad9ae 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -174,7 +174,7 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs): for n, m in model.named_modules(): if type(m) in SUPPORTED_LAYER_TYPES: layer = m - if hasattr(layer,"act_bits") and layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): + if hasattr(layer, "act_bits") and layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): assert hasattr(layer, "act_max") from auto_round.data_type.nvfp import calculate_gparam diff --git a/auto_round/utils.py b/auto_round/utils.py index 187fc883d..a1c411373 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -35,7 +35,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme, preset_name_to_scheme, get_gguf_scheme +from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -1949,7 +1949,7 @@ def _set_config(config, target_config): for key in config.keys(): if key not in 
scheme_keys: config_tmp.pop(key, None) - matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched + matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched if not matched_scheme: if config.get("super_group_size", None) is not None: new_type = new_type[:bits_index] + str(config["bits"]) + "_k" @@ -1963,11 +1963,15 @@ def _set_config(config, target_config): if new_type not in GGUF_INNER_CONFIG: new_type = new_type[:bits_index] + str(config["bits"]) + "_0" if new_type not in GGUF_INNER_CONFIG: - raise ValueError(f"the setting in layer_config {layer_name} " - f"could not match any supported gguf format, please have a check.") + raise ValueError( + f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, please have a check." + ) else: - logger.warning_once(f"the setting in layer_config {layer_name} " - f"could not match any supported gguf format, reset to {new_type}") + logger.warning_once( + f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, reset to {new_type}" + ) new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :] new_type = _search_gguf_type(new_type) if new_type is None: @@ -2850,7 +2854,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str cfg["act_bits"] = b # 4. fill defaults - if isinstance(default_scheme,str): + if isinstance(default_scheme, str): default_dict = asdict(preset_name_to_scheme(default_scheme.upper())) else: default_dict = asdict(default_scheme) From 033d1f6ed3e4b4a128a07c964af01b156c820704 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 30 Sep 2025 16:48:32 +0800 Subject: [PATCH 34/35] update a little --- auto_round/auto_schemes/gen_scheme.py | 66 ++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py index ba6b0a679..03c253a6a 100644 --- a/auto_round/auto_schemes/gen_scheme.py +++ b/auto_round/auto_schemes/gen_scheme.py @@ -17,16 +17,68 @@ import torch from auto_round import AutoScheme +from auto_round.utils import get_layer_features class GenScheme: def __init__( - self, - auto_scheme: AutoScheme, - model: torch.nn.Module, - quant_layer_names: Iterable[str], - fixed_layer_scheme: dict[str, dict], - scale_dtype: str = "fp16", - dataset="pile-10k", + self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme: dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k", ): + self.auto_scheme = auto_scheme + self.model = model + self.quant_layer_names = quant_layer_names + self.fixed_layer_scheme = fixed_layer_scheme + self.scale_dtype = scale_dtype + self.dataset = dataset + + def _get_min_max_avg_bits(self) -> tuple[float, float]: pass + + # not validate yet + def get_layer_bits(self, layer): + weight = layer.weight + n_param = weight.numel() + weight_bits = getattr(layer, 'bits', 16) + group_size = getattr(layer, 'group_size', 128) + super_group_size = getattr(layer, 'super_group_size', None) + super_weight_bits = getattr(layer, 'super_bits', None) + + # Main quantization cost + weight_total_bits = weight_bits * n_param + if weight_bits>=16: # Unquantized layer + return weight_total_bits, 16 + + in_features, output_features = get_layer_features(layer) + # Determine number of groups + if group_size > 0: # group-wise + n_group = output_features * (in_features + group_size - 1) // 
group_size + elif group_size == 0: # per-tensor + n_group = 1 + elif group_size == -1: # per-channel + n_group = output_features # out_channels + else: + raise ValueError(f"Invalid group_size {group_size}") + aux_total_bits = 0 + if not super_group_size: + # Scale and zero point bitwidths + scale_bits = 16 + zp_bits = weight_bits if not super_group_size else 32 # default: same as weight_bits + # Overhead from scales and zero points + aux_total_bits = n_group * (scale_bits + zp_bits) + + # Double quantization case + if super_group_size: + # Number of super-groups + aux_total_bits+=n_group*super_weight_bits * 2 #scale and min int count + n_super_group = (n_group + super_group_size - 1) // super_group_size + aux_total_bits += n_super_group * 32 * 2 # double quant scale and min_v + + total_bits = weight_total_bits + aux_total_bits + avg_bits = total_bits / n_param + return total_bits, avg_bits From 8ae1dfa56220727b39c2c29eacb22a98ab998e10 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 08:50:03 +0000 Subject: [PATCH 35/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/auto_schemes/gen_scheme.py | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/auto_round/auto_schemes/gen_scheme.py b/auto_round/auto_schemes/gen_scheme.py index 03c253a6a..e009e12de 100644 --- a/auto_round/auto_schemes/gen_scheme.py +++ b/auto_round/auto_schemes/gen_scheme.py @@ -22,13 +22,13 @@ class GenScheme: def __init__( - self, - auto_scheme: AutoScheme, - model: torch.nn.Module, - quant_layer_names: Iterable[str], - fixed_layer_scheme: dict[str, dict], - scale_dtype: str = "fp16", - dataset="pile-10k", + self, + auto_scheme: AutoScheme, + model: torch.nn.Module, + quant_layer_names: Iterable[str], + fixed_layer_scheme: dict[str, dict], + scale_dtype: str = "fp16", + dataset="pile-10k", ): self.auto_scheme = auto_scheme self.model = model @@ -44,14 +44,14 @@ def _get_min_max_avg_bits(self) -> tuple[float, float]: def get_layer_bits(self, layer): weight = layer.weight n_param = weight.numel() - weight_bits = getattr(layer, 'bits', 16) - group_size = getattr(layer, 'group_size', 128) - super_group_size = getattr(layer, 'super_group_size', None) - super_weight_bits = getattr(layer, 'super_bits', None) + weight_bits = getattr(layer, "bits", 16) + group_size = getattr(layer, "group_size", 128) + super_group_size = getattr(layer, "super_group_size", None) + super_weight_bits = getattr(layer, "super_bits", None) # Main quantization cost weight_total_bits = weight_bits * n_param - if weight_bits>=16: # Unquantized layer + if weight_bits >= 16: # Unquantized layer return weight_total_bits, 16 in_features, output_features = get_layer_features(layer) @@ -75,9 +75,9 @@ def get_layer_bits(self, layer): # Double quantization case if super_group_size: # Number of super-groups - aux_total_bits+=n_group*super_weight_bits * 2 #scale and min int count + aux_total_bits += n_group * super_weight_bits * 2 # scale and min int count n_super_group = (n_group + super_group_size - 1) // super_group_size - aux_total_bits += n_super_group * 32 * 2 # double quant scale and min_v + aux_total_bits += n_super_group * 32 * 2 # double quant scale and min_v total_bits = weight_total_bits + aux_total_bits avg_bits = total_bits / n_param
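
To make the bookkeeping above easier to check, here is a minimal, self-contained sketch of the same average-bit accounting that get_layer_bits performs. The helper name estimate_avg_bits and the 4096x4096 example shape are illustrative only; the group and super-group conventions (fp16 scales, 32-bit super-group scale and min) follow the patch.

def estimate_avg_bits(in_features, out_features, bits=4, group_size=32,
                      super_group_size=8, super_bits=6, scale_bits=16):
    n_param = in_features * out_features
    weight_total_bits = bits * n_param
    if bits >= 16:  # unquantized layer
        return weight_total_bits, 16.0
    # group-wise quantization: one group per group_size input elements per output channel
    n_group = out_features * ((in_features + group_size - 1) // group_size)
    aux_total_bits = 0
    if not super_group_size:
        # one fp16 scale and one integer zero point per group
        aux_total_bits = n_group * (scale_bits + bits)
    else:
        # double quantization: quantized scale and min per group,
        # plus a 32-bit scale and min per super-group
        aux_total_bits += n_group * super_bits * 2
        n_super_group = (n_group + super_group_size - 1) // super_group_size
        aux_total_bits += n_super_group * 32 * 2
    total_bits = weight_total_bits + aux_total_bits
    return total_bits, total_bits / n_param

# A 4096x4096 layer at 4 bits, group size 32, with double quantization
# works out to 4.625 bits per weight.
print(estimate_avg_bits(4096, 4096))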
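For orientation, the caller-facing flow at the end of this series matches the CUDA test added above. Treat the snippet below as a sketch of the in-progress API rather than a stable contract; AutoScheme handling inside the compressor is still being wired up in these patches.

from auto_round import AutoRound, AutoScheme

# Mix W2A16, W4A16 and BF16 layers so the model averages roughly 3 bits per weight.
# When the compressor sees an AutoScheme, it calls set_layer_config() and hands the
# quantizable and user-fixed layers to the GenScheme helper.
scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16"))
ar = AutoRound(model="facebook/opt-125m", scheme=scheme, iters=1, nsamples=1)
ar.quantize_and_save("./saved")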
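The reworked set_layer_config() in auto_round/utils.py also expands layer_config keys that are not exact module names as regular expressions over the model's quantizable layers, and validates dict values against the QuantizationScheme fields (plus scale_dtype). The override below is a hypothetical illustration; the OPT-style layer names and the chosen bit widths are assumptions, not taken from the patch.

# Hypothetical per-layer overrides passed via AutoRound(..., layer_config=layer_config).
layer_config = {
    "lm_head": "W4A16",                     # exact layer name mapped to a preset scheme
    r"model\.decoder\.layers\.\d+\.fc1": {  # regex key, expanded to every matching layer
        "bits": 2,
        "group_size": 64,
    },
}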