2 files changed: +4 −4 lines
  examples/quantization_w4a16
  src/llmcompressor/modifiers/quantization/gptq

examples/quantization_w4a16
 # Select model and load it.
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+
 model = SparseAutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
@@ -54,7 +55,6 @@ def tokenize(sample):

 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with GPTQ with a group size 128
-# Note: to reduce GPU memory use `sequential_update=False`
 recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

 # Apply algorithms.
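For context, the recipe configured above is applied to the loaded model through llm-compressor's one-shot entrypoint later in the same example. A minimal sketch of that flow follows; the `oneshot` import path, the `ds` calibration dataset, and the save directory are assumptions drawn from the surrounding example script, not part of this diff.

# Sketch only: apply the GPTQ recipe in one shot, then save compressed weights.
# Argument names follow the existing w4a16 example scripts and may differ
# across llm-compressor versions.
from llmcompressor.transformers import oneshot

oneshot(
    model=model,              # SparseAutoModelForCausalLM loaded above
    dataset=ds,               # tokenized calibration dataset from the example
    recipe=recipe,            # GPTQModifier(targets="Linear", scheme="W4A16", ...)
    max_seq_length=2048,
    num_calibration_samples=512,
)

# Hypothetical output directory for illustration only.
model.save_pretrained("./llama3-w4a16-gptq", save_compressed=True)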
src/llmcompressor/modifiers/quantization/gptq

@@ -44,7 +44,7 @@ class GPTQModifier(Modifier):
     | test_stage:
     |    obcq_modifiers:
     |      GPTQModifier:
-    |          sequential_update: True
+    |          sequential_update: true
     |          dampening_frac: 0.001
     |          block_size: 128
     |          config_groups:
@@ -63,7 +63,7 @@ class GPTQModifier(Modifier):


     :param sequential_update: Whether or not to update weights sequentially by layer,
-        True saves on GPU memory
+        True saves on GPU memory, default is True
     :param targets: list of layer names to compress during GPTQ, or '__ALL__'
         to compress every layer in the model
     :param block_size: Used to determine number of columns to compress in one pass
@@ -93,7 +93,7 @@ class GPTQModifier(Modifier):
     and activation 8 bit quantization on the Linear layers.
     """

-    sequential_update: Optional[bool] = False
+    sequential_update: Optional[bool] = True
     targets: Union[str, List[str], None] = None
     sequential_targets: Union[str, List[str], None] = None
     block_size: int = 128
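Because the default now flips to True, callers that want the previous non-sequential behavior must opt out explicitly. A minimal sketch, assuming the same GPTQModifier constructor used in the example above:

# Sketch only: opt back into the pre-change default (sequential_update=False),
# which uses more GPU memory than the new sequential default.
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    sequential_update=False,  # previous default; higher GPU memory use
)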