[Multi-modifier] Support scoped application of quantization config/status #1772
This PR adds a README for the new multi-modifier example:

# Quantizing Models with Multiple Quantization Modifiers #

This section outlines how multiple quantization modifiers can be applied to the same model, for example applying GPTQ W8A8 to a model's `self_attn` layers and AWQ W4A16 to its `mlp` layers. The heterogeneous application of multiple modifiers comes in two flavors:

1. Run every modifier in a single, sequential pipeline, performing a single calibrated run. See `./llama3_example.py` for an example.
2. Run each modifier in its own, independent pipeline, performing a calibrated run for each modifier. To run each modifier independently, run `./llama3_example.py` with `oneshot(..., pipeline="independent")` instead of `pipeline="sequential"`, as shown in the sketch below.

This is an advanced usage of `llm-compressor` and an active area of research. Best practices will be provided in a future release, after further research and sensitivity analysis.
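As a minimal sketch of the two options (assuming `model`, `ds`, and `recipe` are already defined as in the example script), only the `pipeline` argument passed to `oneshot` changes:

```python
from llmcompressor import oneshot

# Option 1: a single sequential pipeline, where all modifiers share one calibrated run.
oneshot(model=model, dataset=ds, recipe=recipe, pipeline="sequential")

# Option 2: independent pipelines, where each modifier performs its own calibrated run.
# oneshot(model=model, dataset=ds, recipe=recipe, pipeline="independent")
```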
The PR also adds the example script referenced above (`./llama3_example.py`):

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQMapping, AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithms to run:
#   * quantize self_attn layers to W8A8 with GPTQ
#   * quantize mlp layers to W4A16 with AWQ
# Only include AWQ mappings pertaining to the target layers.
recipe = [
    GPTQModifier(targets=r"re:.*self_attn\.(k|q|o|v)_proj$", scheme="W8A8"),
    AWQModifier(
        targets=r"re:.*mlp\.(down|gate|up)_proj$",
        mappings=[
            AWQMapping(
                "re:.*post_attention_layernorm$",
                ["re:.*gate_proj$", "re:.*up_proj$"],
            ),
            AWQMapping(
                "re:.*up_proj$",
                ["re:.*down_proj$"],
            ),
        ],
        scheme="W4A16",
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # Option 1) run both modifiers in a single calibrated run
    pipeline="sequential",
    # Option 2) run each modifier in its own separate pipeline
    # pipeline="independent",
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
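After `oneshot(...)` returns, a quick sanity check along the following lines can confirm that each scope received its intended scheme. This is an illustrative sketch, not part of the PR: the Llama submodule paths are assumptions about the model layout, while the `quantization_scheme` attribute is the one attached by the modifiers.

```python
# Illustrative sanity check (assumes the standard Llama layer layout):
# each targeted module should carry the quantization_scheme attached by its modifier.
attn_proj = model.model.layers[0].self_attn.q_proj
mlp_proj = model.model.layers[0].mlp.down_proj
print(getattr(attn_proj, "quantization_scheme", None))  # expected: the W8A8 scheme from GPTQModifier
print(getattr(mlp_proj, "quantization_scheme", None))   # expected: the W4A16 scheme from AWQModifier
```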
The remaining changes scope the quantization modifier's config/status application to the modules matched by the modifier's `targets` and `ignore` lists:

```diff
@@ -14,6 +14,7 @@
     is_preset_scheme,
     preset_name_to_scheme,
 )
+from compressed_tensors.utils import match_named_modules
 from pydantic import Field, PrivateAttr, field_validator
 from torch.utils.hooks import RemovableHandle
```
```diff
@@ -116,41 +117,49 @@ def validate_scheme(
 
     def initialize_quantization(self, model: torch.nn.Module):
         """
-        Attach quantization schemes and observers to modules in the model according to
+        Attach quantization schemes to modules in the model according to
         the quantization config specified on this modifier
 
         :param model: model to attach schemes and observers to
         """
-        reset_quantization_status(model)  # reset any previously applied qconfigs
-
-        # apply scheme and status to model
         config = self.resolve_quantization_config()
+
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            reset_quantization_status(module)  # reset any previously applied qconfigs
+
         apply_quantization_config(model, config)
 
-        # apply observers, disable quantization until calibration
-        model.apply(self._initialize_observers)
+        # TODO should we disable for entire model or just matching modules?
+        # disable quantization until calibration
         model.apply(disable_quantization)
 
     def start_calibration(self, model: torch.nn.Module):
         """
-        Register activation calibration hooks (including kv_cache quantization) and
-        enable quantization as we calibrate
+        Attach observers, register activation calibration hooks (including
+        kv_cache quantization) and enable quantization as we calibrate
 
         :param model: model to prepare for calibration
         """
         self._calibration_hooks = self._initialize_hooks(model)
-        model.apply(apply_calibration_status)
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            self._initialize_observers(module)
+            apply_calibration_status(module)
 
+        # TODO should we disable for entire model or just matching modules?
         model.apply(enable_quantization)  # quantize at the same time as calibrate
 
     def end_calibration(self, model: torch.nn.Module):
         """
-        Remove calibration hooks and set the model status to frozen. Keep quantization
-        enabled for future operations
+        Remove calibration hooks and observers, and set the model status to frozen.
+        Keep quantization enabled for future operations
 
         :param model: model to end calibration for
         """
         self.remove_hooks(self._calibration_hooks)
-        model.apply(freeze_module_quantization)  # remove observers
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            freeze_module_quantization(module)  # remove observers
 
         model.apply(enable_quantization)  # keep quantization enabled
 
     def has_config(self) -> bool:
```

An inline review thread on the `start_calibration` change:

> **Reviewer:** Why can't we keep this in `initialize_quantization`?
>
> **Reply:** Observers should be initialized on start to align with them being removed on end, so this was moved into `on_start` instead. Without this change, the lifecycle with multiple quant modifiers will trigger observer hooks before the modifier starts (before it sees any data), which can now happen during a previous modifier's lifecycle.
```diff
@@ -240,7 +249,7 @@ def _initialize_observers(self, module: torch.nn.Module):
 
     def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]:
         hooks = set()
-        for module in model.modules():
+        for _, module in match_named_modules(model, self.targets, self.ignore):
             if not hasattr(module, "quantization_scheme"):
                 continue
```
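To make the scoping concrete, here is a small illustrative snippet (not part of the PR) showing how `match_named_modules` resolves each modifier's targets to a disjoint set of modules. The regex targets are copied from the example recipe; `model` is assumed to be the loaded Llama model, and passing lists plus an empty `ignore` argument is an assumption about the utility's accepted inputs.

```python
from compressed_tensors.utils import match_named_modules

# Targets copied from the example recipe above.
attn_targets = [r"re:.*self_attn\.(k|q|o|v)_proj$"]
mlp_targets = [r"re:.*mlp\.(down|gate|up)_proj$"]

# Each modifier only ever touches the modules its own targets resolve to,
# so the two modifiers manage disjoint scopes of the same model.
attn_names = {name for name, _ in match_named_modules(model, attn_targets, [])}
mlp_names = {name for name, _ in match_named_modules(model, mlp_targets, [])}

assert attn_names.isdisjoint(mlp_names)
print(f"GPTQ scope: {len(attn_names)} modules, AWQ scope: {len(mlp_names)} modules")
```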