Commit e575558

support for gguf mixed q2_k_s (#1059)
* support for gguf mixed q2_k_s

Signed-off-by: n1ck-guo <[email protected]>
1 parent e1ec855 · commit e575558

3 files changed: +68 −25 lines

auto_round/compressors/base.py

Lines changed: 38 additions & 25 deletions
@@ -55,7 +55,13 @@
 from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG, ModelType
 from auto_round.logger import logger
-from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme
+from auto_round.schemes import (
+    SPECIAL_SCHEMES,
+    QuantizationScheme,
+    _handle_special_schemes,
+    get_gguf_scheme,
+    preset_name_to_scheme,
+)
 from auto_round.sign_sgd import SignSGD
 from auto_round.special_model_handler import _handle_moe_model
 from auto_round.utils import (
@@ -214,6 +220,33 @@ def __init__(
         ... }
         """
 
+        # Model related
+        model_dtype = kwargs.pop("model_dtype", None)
+        self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False
+        self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False
+        self.quantized = False
+        if isinstance(model, str):
+            model, tokenizer = llm_load_model(
+                model,
+                platform=platform,
+                device="cpu",  # always load on CPU first
+                model_dtype=model_dtype,
+            )
+        elif tokenizer is None and not self.diffusion and iters > 0:
+            raise ValueError("A tokenizer must be set for non-str model input")
+        if unsupported_meta_device(model):
+            raise RuntimeError(
+                "AutoRound does not support parameters on meta device. "
+                "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU."
+            )
+        check_and_mark_fp8_model(model)
+        self.model = model.eval()
+        self.tokenizer = tokenizer
+        self.shared_cache_keys = get_shared_keys(self.model)
+
+        self.layer_config = layer_config
+
+        # should be set after loading the model and setting layer_config, since some special schemes need them
         self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs)
 
         gguf_scheme_name = get_gguf_scheme(self.scheme)
@@ -244,11 +277,8 @@ def __init__(
             platform = "model_scope"
         self.platform = platform
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
-        self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False
-        self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False
 
         self.fp_layers = kwargs.pop("fp_layers", "")
-        self.layer_config = layer_config
         self.supported_types = SUPPORTED_LAYER_TYPES
         self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
         self.scale_dtype = convert_dtype_str2torch(scale_dtype)
@@ -270,27 +300,6 @@ def __init__(
         else:
             torch.use_deterministic_algorithms(True, warn_only=True)
 
-        # Model related
-        self.quantized = False
-        if isinstance(model, str):
-            model, tokenizer = llm_load_model(
-                model,
-                platform=platform,
-                device="cpu",  # always load on CPU first
-                model_dtype=model_dtype,
-            )
-        elif tokenizer is None and not self.diffusion and iters > 0:
-            raise ValueError("A tokenizer must be set for non-str model input")
-        if unsupported_meta_device(model):
-            raise RuntimeError(
-                "AutoRound does not support parameters on meta device. "
-                "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU."
-            )
-        check_and_mark_fp8_model(model)
-        self.model = model.eval()
-        self.tokenizer = tokenizer
-        self.shared_cache_keys = get_shared_keys(self.model)
-
         self.to_quant_block_names = to_quant_block_names
         if not hasattr(self, "quant_block_list"):
             all_blocks = get_block_names(model)
@@ -524,6 +533,8 @@ def _parse_and_set(scheme, kwargs):
                 scheme = scheme.strip("'\" ")
                 res = scheme
                 scheme = scheme.upper()
+                if scheme in SPECIAL_SCHEMES:
+                    self.layer_config = _handle_special_schemes(scheme, self.layer_config, self.model)
                 scheme = asdict(preset_name_to_scheme(scheme))
             scheme_keys = [f.name for f in fields(QuantizationScheme)]
             for key in scheme_keys:
@@ -776,6 +787,8 @@ def remove_duplicates(lst):
 
         if gguf_format_name:
             for i in range(len(formats)):
+                if gguf_format_name.lower().endswith("mixed"):
+                    gguf_format_name = gguf_format_name.lower().replace("_mixed", "_s")
                 if formats[i] != "fake" and formats[i] != gguf_format_name.lower():
                     logger.warning(
                         f"reset format {formats[i]} to {gguf_format_name.lower()} "

auto_round/schemes.py

Lines changed: 21 additions & 0 deletions
@@ -16,6 +16,8 @@
 from dataclasses import dataclass, fields
 from typing import Optional, Union
 
+import torch
+
 __all__ = ["QuantizationScheme", "get_gguf_scheme", "preset_name_to_scheme"]
 
 
@@ -265,6 +267,25 @@ def is_preset_scheme(name: str) -> bool:
     value.pop("lm_head", None)
     PRESET_SCHEMES[key.upper()] = QuantizationScheme.from_dict(value)
 
+SPECIAL_SCHEMES = {"GGUF:Q2_K_MIXED": PRESET_SCHEMES["GGUF:Q2_K_S"]}
+PRESET_SCHEMES.update(SPECIAL_SCHEMES)
+
+
+def _handle_special_schemes(scheme_name: str, layer_config: dict, model: torch.nn.Module) -> dict:
+    """Handle special schemes, like GGUF:Q2_K_MIXED.
+
+    Provides some special auto_round recipes.
+    """
+    if scheme_name == "GGUF:Q2_K_MIXED":
+        for n, m in model.named_modules():
+            if n in layer_config:
+                continue
+            if n == "lm_head" or isinstance(m, torch.nn.Embedding):
+                layer_config[n] = "GGUF:Q8_0"
+            elif isinstance(m, torch.nn.Linear) and ("expert" not in n or "shared_experts" in n) and n != "lm_head":
+                layer_config[n] = "GGUF:Q4_K_S"
+    return layer_config
+
 
 def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str:
     if isinstance(scheme, str) and scheme.upper().startswith("GGUF"):
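
The effect of the new helper is easiest to see on a toy model. A minimal sketch with hypothetical module names: embeddings and `lm_head` get `GGUF:Q8_0`, ordinary `Linear` layers get `GGUF:Q4_K_S`, and expert `Linear`s (other than `shared_experts`) are left to the base `GGUF:Q2_K_S` scheme:

```python
# Toy demonstration of _handle_special_schemes (module names are hypothetical).
import torch

from auto_round.schemes import _handle_special_schemes


class ToyMoE(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_tokens = torch.nn.Embedding(128, 16)      # Embedding -> GGUF:Q8_0
        self.q_proj = torch.nn.Linear(16, 16)                # plain Linear -> GGUF:Q4_K_S
        self.experts_up_proj = torch.nn.Linear(16, 16)       # expert -> skipped, stays on base Q2_K_S
        self.shared_experts_gate = torch.nn.Linear(16, 16)   # shared expert -> GGUF:Q4_K_S
        self.lm_head = torch.nn.Linear(16, 128)              # lm_head -> GGUF:Q8_0


cfg = _handle_special_schemes("GGUF:Q2_K_MIXED", {}, ToyMoE())
# {'embed_tokens': 'GGUF:Q8_0', 'q_proj': 'GGUF:Q4_K_S',
#  'shared_experts_gate': 'GGUF:Q4_K_S', 'lm_head': 'GGUF:Q8_0'}
```

Layers already present in `layer_config` are skipped, so user-supplied overrides win over the recipe.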

test/test_cpu/test_gguf_format.py

Lines changed: 9 additions & 0 deletions
@@ -309,6 +309,15 @@ def test_all_format(self):
             assert False, "cmd line test fail, please have a check"
         shutil.rmtree("../../tmp_autoround", ignore_errors=True)
 
+        # test mixed q2_k_s
+        res = os.system(
+            f"cd ../.. && {python_path} -m auto_round --model {model_name}"
+            f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED"
+        )
+        if res > 0 or res == -1:
+            assert False, "cmd line test fail, please have a check"
+        shutil.rmtree("../../tmp_autoround", ignore_errors=True)
+
     def test_vlm_gguf(self):
         model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct"
         from auto_round import AutoRoundMLLM
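
For reference, a hedged sketch of the equivalent Python API call (the model id is a placeholder, and `quantize_and_save` plus the `gguf:q2_k_s` format string are assumed from existing AutoRound usage, not from this commit):

```python
# Hedged sketch: invoking the new scheme through the Python API instead of
# the CLI used in the test above. Model id and export format are assumptions.
from auto_round import AutoRound

autoround = AutoRound(
    "Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model id
    scheme="GGUF:Q2_K_MIXED",      # the new special scheme
    iters=0,                       # RTN fast path, matching the test flags
    nsamples=1,
    seqlen=16,
)
# "_mixed" is normalized to the q2_k_s format at export time (see the base.py hunk).
autoround.quantize_and_save("./tmp_autoround", format="gguf:q2_k_s")
```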
