diff --git a/examples/quantization_non_uniform/README.md b/examples/quantization_non_uniform/README.md
index c2e50bdbf..f9dd5b3eb 100644
--- a/examples/quantization_non_uniform/README.md
+++ b/examples/quantization_non_uniform/README.md
@@ -9,3 +9,12 @@ We demonstrate mixed precision by quantizing models to both int8 and int4, and i
 ## Multiple Strategies
 
 It may also be interesting to quantize a model with two different [quantization strategies](https://github.com/neuralmagic/compressed-tensors/blob/a2bfc03e9d52824ba5d6d2a50c8741dd9bccd5d3/src/compressed_tensors/quantization/quant_args.py#L93) such as group, channel, or per-tensor. [Here](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py) we apply fp8 quantization where all the attention weights are quantized using the per-channel strategy, and all the mlp weights are quantized using per-tensor. This is accomplished through defining multiple config groups in the recipe. The produced model is compressed using the `float-quantized` compressor and can be directly run in vllm.
+
+## Quantization with Multiple Quantization Modifiers
+
+This section outlines how multiple quantization modifiers can be applied to the same model for mixed-precision quantization, for example applying GPTQ W8A8 to a model's `self_attn` layers and AWQ W4A16 to its `mlp` layers. This heterogeneous application of multiple modifiers comes in two flavors:
+
+1. Run every modifier in a single, sequential pipeline, performing one calibrated run. See `./quantization_multiple_modifiers.py` for an example.
+2. Run each modifier in its own independent pipeline, performing a separate calibrated run for each modifier. To do so, run `./quantization_multiple_modifiers.py` with `oneshot(..., pipeline="independent")` instead of `pipeline="sequential"`.
+
+This is an advanced usage of `llm-compressor` and an active area of research. Best practices will be provided in a future release, after further research and sensitivity analysis.
\ No newline at end of file
diff --git a/examples/quantization_non_uniform/quantization_multiple_modifiers.py b/examples/quantization_non_uniform/quantization_multiple_modifiers.py
new file mode 100644
index 000000000..d4b8d9175
--- /dev/null
+++ b/examples/quantization_non_uniform/quantization_multiple_modifiers.py
@@ -0,0 +1,101 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQMapping, AWQModifier
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# Select model and load it.
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
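+# NOTE: each ultrachat sample stores its conversation under the "messages" key;
+# preprocess() below flattens it to plain text via the tokenizer's chat template.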
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# Configure the quantization algorithms to run:
+#   * quantize self_attn layers to W8A8 with GPTQ
+#   * quantize mlp layers to W4A16 with AWQ
+# Only include AWQ mappings pertaining to the target layers.
+recipe = [
+    GPTQModifier(targets=r"re:.*self_attn\.(k|q|o|v)_proj$", scheme="W8A8"),
+    AWQModifier(
+        targets=r"re:.*mlp\.(down|gate|up)_proj$",
+        mappings=[
+            AWQMapping(
+                "re:.*post_attention_layernorm$",
+                ["re:.*gate_proj$", "re:.*up_proj$"],
+            ),
+            AWQMapping(
+                "re:.*up_proj$",
+                ["re:.*down_proj$"],
+            ),
+        ],
+        scheme="W4A16",
+    ),
+]
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    # Option 1) run both modifiers in a single calibrated run
+    pipeline="sequential",
+    # Option 2) run each modifier in its own separate pipeline
+    # pipeline="independent",
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
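+# NOTE: save_compressed=True writes the checkpoint in compressed-tensors format
+# rather than as dense safetensors weights.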
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-gptq-w8a8-self_attn-awq-w4a16-mlp"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 7d45820de..49698fe09 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -265,8 +265,10 @@ def on_end(self, state: State, event: Event, **kwargs):
 
         self.ended_ = True
 
-        modules = list(state.model.modules())
-        for module in tqdm(modules, desc="Calibrating weights"):
+        for _, module in tqdm(
+            match_named_modules(state.model, self.targets, self.ignore),
+            desc="Calibrating weights",
+        ):
             update_weight_zp_scale(module)
 
         QuantizationMixin.end_calibration(self, state.model)
diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index d446dd324..ba9cd122e 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -157,6 +157,7 @@ class AWQMapping:
     "Phi3ForCausalLM": _phi_mappings,
     "Phi3VForCausalLM": _phi_mappings,
     "Qwen2ForCausalLM": _default_mappings,
+    "Qwen2_5OmniThinkerForConditionalGeneration": _default_mappings,
     "Qwen2MoeForCausalLM": _moe_default_mappings,
     "Qwen3ForCausalLM": _default_mappings,
     "Qwen3MoeForCausalLM": _moe_default_mappings,
diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py
index 1341f0956..b09b41812 100644
--- a/src/llmcompressor/modifiers/quantization/cache.py
+++ b/src/llmcompressor/modifiers/quantization/cache.py
@@ -9,8 +9,7 @@
 
 from typing import Any, Dict, List, Optional, Tuple
 
-from compressed_tensors.quantization import KVCacheScaleType
-from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization import KVCacheScaleType, QuantizationArgs
 from torch import Tensor
 from transformers import DynamicCache
 
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index bee22fe6e..584f17643 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -13,6 +13,7 @@
     align_module_device,
     get_execution_device,
     getattr_chain,
+    match_named_modules,
     update_offload_parameter,
 )
 from loguru import logger
@@ -161,7 +162,10 @@ def on_initialize(self, state: State, **kwargs) -> bool:
         QuantizationMixin.initialize_quantization(self, state.model)
 
         # prepare module names
-        self._module_names = {m: name for name, m in state.model.named_modules()}
+        self._module_names = {
+            m: name
+            for name, m in match_named_modules(state.model, self.targets, self.ignore)
+        }
 
         return True
 
@@ -174,7 +178,7 @@ def on_start(self, state: State, event: Event, **kwargs):
 
         # register gptq hooks
         added_hook = False
-        for module in state.model.modules():
+        for _, module in match_named_modules(state.model, self.targets, self.ignore):
             if getattr_chain(module, "quantization_scheme.weights", None) is not None:
                 # HACK: previously, embeddings were not quantized because they were not
                 # accessible by the layer compressor. For now, we manually ignore it,
diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py
index 07332f214..aa6208da4 100644
--- a/src/llmcompressor/modifiers/quantization/quantization/base.py
+++ b/src/llmcompressor/modifiers/quantization/quantization/base.py
@@ -1,4 +1,5 @@
 import tqdm
+from compressed_tensors.utils import match_named_modules
 
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
@@ -69,14 +70,16 @@ def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
         QuantizationMixin.start_calibration(self, state.model)
 
-        modules = list(state.model.modules())
+        named_modules = list(
+            match_named_modules(state.model, self.targets, self.ignore)
+        )
         # TODO: this step can be combined with update_weight_zp_scale
         # once update_fused_layer_weight_global_scales is removed
         # and not required by vLLM
-        for module in tqdm.tqdm(modules):
+        for _, module in tqdm.tqdm(named_modules):
             update_weight_global_scale(module)
 
-        for module in tqdm.tqdm(modules, desc="Calibrating weights"):
+        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
             update_fused_layer_weight_global_scales(module)
             update_weight_zp_scale(module)
 
diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py
index d193d85a1..f37efb56a 100644
--- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py
+++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py
@@ -14,7 +14,8 @@
     is_preset_scheme,
     preset_name_to_scheme,
 )
-from pydantic import Field, PrivateAttr, field_validator
+from compressed_tensors.utils import match_named_modules
+from pydantic import Field, PrivateAttr, field_validator, model_validator
 from torch.utils.hooks import RemovableHandle
 
 from llmcompressor.modifiers.quantization.calibration import (
@@ -58,8 +59,9 @@ class QuantizationMixin(HooksMixin):
 
     :param config_groups: dictionary specifying quantization schemes to apply to
         target modules. Modules not matching a scheme target will NOT be quantized.
-    :param targets: list of layer names to quantize if a scheme is provided. Defaults
-        to Linear layers
+    :param targets: list of layer names to quantize if a scheme is provided. If unset,
+        will contain all targets listed in config_groups. If config_groups is also
+        unset, will default to ["Linear"] (i.e. all Linear layers will be targeted).
     :param ignore: optional list of module class names or submodule names to not
         quantize even if they match a target in config_groups. Defaults to empty list.
     :param scheme: a single quantization scheme to apply to the model. This is a
@@ -81,7 +83,7 @@
     """
 
     config_groups: Optional[Dict[str, QuantizationScheme]] = None
-    targets: Union[str, List[str]] = Field(default_factory=lambda: ["Linear"])
+    targets: Union[str, List[str]] = Field(default_factory=list)
     ignore: List[str] = Field(default_factory=list)
     scheme: Optional[Union[str, Dict[str, Any]]] = None
     kv_cache_scheme: Optional[QuantizationArgs] = None
@@ -114,43 +116,71 @@ def validate_scheme(
 
         return value
 
+    @model_validator(mode="after")
+    def validate_model_after(model: "QuantizationMixin") -> "QuantizationMixin":
+        """
+        - If targets have not been set, aggregate targets from config_groups
+          into a single unique list
+        - If targets have still not been found, default to targets=["Linear"]
+        """
+
+        if len(model.targets) > 0 and model.config_groups is not None:
+            raise ValueError("Please specify either `targets` or `config_groups`")
+
+        if len(model.targets) == 0 and model.config_groups is not None:
+            for config_group in model.config_groups.values():
+                for target in config_group.targets:
+                    if target not in model.targets:
+                        model.targets.append(target)
+
+        if len(model.targets) == 0:
+            model.targets.append("Linear")
+
+        return model
+
     def initialize_quantization(self, model: torch.nn.Module):
         """
-        Attach quantization schemes and observers to modules in the model according to
+        Attach quantization schemes to modules in the model according to
         the quantization config specified on this modifier
 
         :param model: model to attach schemes and observers to
         """
-        reset_quantization_status(model)  # reset any previously applied qconfigs
-
         # apply scheme and status to model
         config = self.resolve_quantization_config()
+
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            reset_quantization_status(module)  # reset any previously applied qconfigs
+
         apply_quantization_config(model, config)
 
-        # apply observers, disable quantization until calibration
-        model.apply(self._initialize_observers)
+        # disable quantization until calibration
        model.apply(disable_quantization)
 
     def start_calibration(self, model: torch.nn.Module):
         """
-        Register activation calibration hooks (including kv_cache quantization) and
-        enable quantization as we calibrate
+        Attach observers, register activation calibration hooks (including
+        kv_cache quantization) and enable quantization as we calibrate
 
         :param model: model to prepare for calibration
         """
         self._calibration_hooks = self._initialize_hooks(model)
-        model.apply(apply_calibration_status)
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            self._initialize_observers(module)
+            apply_calibration_status(module)
+
         model.apply(enable_quantization)  # quantize at the same time as calibrate
 
     def end_calibration(self, model: torch.nn.Module):
         """
-        Remove calibration hooks and set the model status to frozen. Keep quantization
-        enabled for future operations
+        Remove calibration hooks and observers, and set the model status to frozen.
+        Keep quantization enabled for future operations
 
         :param model: model to end calibration for
         """
         self.remove_hooks(self._calibration_hooks)
-        model.apply(freeze_module_quantization)  # remove observers
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            freeze_module_quantization(module)  # remove observers
+
         model.apply(enable_quantization)  # keep quantization enabled
 
     def has_config(self) -> bool:
@@ -240,7 +270,7 @@ def _initialize_observers(self, module: torch.nn.Module):
 
     def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]:
         hooks = set()
-        for module in model.modules():
+        for _, module in match_named_modules(model, self.targets, self.ignore):
             if not hasattr(module, "quantization_scheme"):
                 continue
 
diff --git a/tests/llmcompressor/modifiers/calibration/test_frozen.py b/tests/llmcompressor/modifiers/calibration/test_frozen.py
index 4b89a0084..9a9ea6a18 100644
--- a/tests/llmcompressor/modifiers/calibration/test_frozen.py
+++ b/tests/llmcompressor/modifiers/calibration/test_frozen.py
@@ -37,7 +37,7 @@ def test_set_module_for_calibration():
     layer = Linear(4, 4)
     initialize_module_for_quantization(layer, quantization_scheme)
 
-    layer.quantization_status = QuantizationStatus("calibration")
+    layer.quantization_status = QuantizationStatus.CALIBRATION
 
     initialize_observer(layer, "weight")
     # should have both input and weight observer after initalizing
@@ -48,4 +48,4 @@
     assert not hasattr(layer, "input_observer")
     assert not hasattr(layer, "weight_observer")
 
-    assert layer.quantization_status == QuantizationStatus("frozen")
+    assert layer.quantization_status == QuantizationStatus.FROZEN
diff --git a/tests/llmcompressor/modifiers/quantization/test_base.py b/tests/llmcompressor/modifiers/quantization/test_base.py
index 931f7deb6..ce62115fd 100644
--- a/tests/llmcompressor/modifiers/quantization/test_base.py
+++ b/tests/llmcompressor/modifiers/quantization/test_base.py
@@ -95,12 +95,11 @@ def test_block_strategy_parsing(block_q_config_kwargs):
 def test_actorder_resolution(
     has_actorder, actorder, q_config_kwargs, expected_0, expected_1
 ):
-    if has_actorder:
-        modifier = GPTQModifier(**q_config_kwargs, actorder=actorder)
-    else:
-        modifier = GPTQModifier(**q_config_kwargs)
-
     with pytest.raises(ValueError) if expected_0 == "error" else nullcontext():
+        if has_actorder:
+            modifier = GPTQModifier(**q_config_kwargs, actorder=actorder)
+        else:
+            modifier = GPTQModifier(**q_config_kwargs)
         resolved = modifier.resolve_quantization_config()
 
     if expected_0 != "error":
@@ -155,8 +154,8 @@ def test_config_resolution(strategies, actorder):
 )
 def test_serialize_actorder(has_actorder, actorder, exp_actorder):
     if has_actorder:
-        modifier = GPTQModifier(targets=["Linear"], actorder=actorder)
+        modifier = GPTQModifier(targets=["Linear"], scheme="W8A8", actorder=actorder)
     else:
-        modifier = GPTQModifier(targets=["Linear"])
+        modifier = GPTQModifier(targets=["Linear"], scheme="W8A8")
 
     assert modifier.model_dump()["actorder"] == exp_actorder
diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml
index 67aa5df3f..127a830c3 100644
--- a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml
+++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml
@@ -14,4 +14,3 @@ test_stage:
           targets: ["Linear", "Embedding"]
     GPTQModifier:
       block_size: 128
-      targets: ["re:model.layers.\\d+$"]
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py
index a4841a0b4..72366fddc 100644
--- a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py
+++ b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py
@@ -184,10 +184,14 @@ def test_quant_model_reload(format, dtype, tmp_path):
     og_state_dict = model.state_dict()
     save_path_compressed = tmp_path / "compressed"
 
-    for _, module in model.named_modules():
+    for name, module in model.named_modules():
         if hasattr(module, "quantization_scheme"):
-            assert module.weight.dtype == dtype
-            assert module.quantization_status == QuantizationStatus.FROZEN
+            assert (
+                module.weight.dtype == dtype
+            ), f"Module {name} has incorrect weight dtype"
+            assert (
+                module.quantization_status == QuantizationStatus.FROZEN
+            ), f"Module {name} has incorrect quantization status"
 
     # Save to disk
     model.save_pretrained(