2 files changed: +4 −4 lines
  examples/quantization_w4a16
  src/llmcompressor/modifiers/quantization/gptq

examples/quantization_w4a16
 # Select model and load it.
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+
 model = SparseAutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
@@ -54,7 +55,6 @@ def tokenize(sample):

 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with GPTQ with a group size 128
-# Note: to reduce GPU memory use `sequential_update=False`
 recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

 # Apply algorithms.
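For context, the recipe configured above is applied to the loaded model through llm-compressor's one-shot entrypoint later in the same example. A minimal sketch of that flow follows; the `oneshot` import path, the `ds` calibration dataset, and the save directory are assumptions drawn from the surrounding example script, not part of this diff.

# Sketch only: apply the GPTQ recipe in one shot, then save compressed weights.
# Argument names follow the existing w4a16 example scripts and may differ
# across llm-compressor versions.
from llmcompressor.transformers import oneshot

oneshot(
    model=model,              # SparseAutoModelForCausalLM loaded above
    dataset=ds,               # tokenized calibration dataset from the example
    recipe=recipe,            # GPTQModifier(targets="Linear", scheme="W4A16", ...)
    max_seq_length=2048,
    num_calibration_samples=512,
)

# Hypothetical output directory for illustration only.
model.save_pretrained("./llama3-w4a16-gptq", save_compressed=True)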
src/llmcompressor/modifiers/quantization/gptq

@@ -44,7 +44,7 @@ class GPTQModifier(Modifier):
     | test_stage:
     |    obcq_modifiers:
     |      GPTQModifier:
-    |          sequential_update: True
+    |          sequential_update: true
     |          dampening_frac: 0.001
     |          block_size: 128
     |          config_groups:
@@ -63,7 +63,7 @@ class GPTQModifier(Modifier):


     :param sequential_update: Whether or not to update weights sequentially by layer,
-        True saves on GPU memory
+        True saves on GPU memory, default is True
     :param targets: list of layer names to compress during GPTQ, or '__ALL__'
         to compress every layer in the model
     :param block_size: Used to determine number of columns to compress in one pass
@@ -93,7 +93,7 @@ class GPTQModifier(Modifier):
     and activation 8 bit quantization on the Linear layers.
     """

-    sequential_update: Optional[bool] = False
+    sequential_update: Optional[bool] = True
     targets: Union[str, List[str], None] = None
     sequential_targets: Union[str, List[str], None] = None
     block_size: int = 128
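Because the default now flips to True, callers that want the previous non-sequential behavior must opt out explicitly. A minimal sketch, assuming the same GPTQModifier constructor used in the example above:

# Sketch only: opt back into the pre-change default (sequential_update=False),
# which uses more GPU memory than the new sequential default.
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    sequential_update=False,  # previous default; higher GPU memory use
)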