@@ -1,4 +1,5 @@
 import argparse
+import gc
 import re
 from typing import Tuple

@@ -10,6 +11,7 @@


 # HACK: override the dtype_byte_size function in transformers to support float8 types
+# Fix is posted upstream https://github.com/huggingface/transformers/pull/30488
 def new_dtype_byte_size(dtype):
     if dtype == torch.bool:
         return 1 / 8
@@ -23,6 +25,11 @@ def new_dtype_byte_size(dtype):
 transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size


+def cleanup_memory():
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
 def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
     """Quantize a tensor using per-tensor static scaling factor.

@@ -33,7 +40,14 @@ def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
     # Calculate the scale as dtype max divided by absmax.
     # Since .abs() creates a new tensor, we use aminmax to get
     # the min and max first and then calculate the absmax.
-    min_val, max_val = tensor.aminmax()
+    if tensor.numel() == 0:
+        # Deal with empty tensors (triggered by empty MoE experts)
+        min_val, max_val = (
+            torch.tensor(0.0, dtype=tensor.dtype),
+            torch.tensor(1.0, dtype=tensor.dtype),
+        )
+    else:
+        min_val, max_val = tensor.aminmax()
     amax = min_val.abs().max(max_val.abs())
     scale = finfo.max / amax.clamp(min=1e-12)
     # scale and clamp the tensor to bring it to
@@ -145,68 +159,80 @@ def forward(self, x):
         return output


+def replace_module(model, name, new_module):
+    if "." in name:
+        parent_name = name.rsplit(".", 1)[0]
+        child_name = name[len(parent_name) + 1 :]
+        parent = model.model.get_submodule(parent_name)
+    else:
+        parent_name = ""
+        parent = model.model
+        child_name = name
+    setattr(parent, child_name, new_module)
+
+
 def quantize_weights(model):
     for name, linear in model.model.named_modules():
+        # if "gate" in name or not isinstance(linear, torch.nn.Linear):
         if not isinstance(linear, torch.nn.Linear):
             continue
         quant_weight, quant_scale = per_tensor_quantize(linear.weight)
         quant_linear = FP8DynamicLinear(quant_weight, quant_scale)
-        if "." in name:
-            parent_name = name.rsplit(".", 1)[0]
-            child_name = name[len(parent_name) + 1 :]
-            parent = model.model.get_submodule(parent_name)
-        else:
-            parent_name = ""
-            parent = model.model
-            child_name = name
-        setattr(parent, child_name, quant_linear)
+        replace_module(model, name, quant_linear)
+        del linear
+    cleanup_memory()


 def quantize_activations(model, calibration_tokens):
     # Replace layers with quantizer.
     for name, dynamic_quant_linear in model.model.named_modules():
+        # if "gate" in name or not isinstance(dynamic_quant_linear, FP8DynamicLinear):
         if not isinstance(dynamic_quant_linear, FP8DynamicLinear):
             continue
         quantizer = FP8StaticLinearQuantizer(
             dynamic_quant_linear.weight, dynamic_quant_linear.weight_scale
         )
-        if "." in name:
-            parent_name = name.rsplit(".", 1)[0]
-            child_name = name[len(parent_name) + 1 :]
-            parent = model.model.get_submodule(parent_name)
-        else:
-            parent_name = ""
-            parent = model.model
-            child_name = name
-        setattr(parent, child_name, quantizer)
+        replace_module(model, name, quantizer)
+        del dynamic_quant_linear
+    cleanup_memory()

     # Calibration.
     for row_idx in range(calibration_tokens.shape[0]):
         _ = model(calibration_tokens[row_idx].reshape(1, -1))

     # Replace quantizer with StaticLayer.
     for name, quantizer in model.model.named_modules():
+        # if "gate" in name or not isinstance(quantizer, FP8StaticLinearQuantizer):
         if not isinstance(quantizer, FP8StaticLinearQuantizer):
             continue
         static_proj = FP8StaticLinear(
             quantizer.weight, quantizer.weight_scale, quantizer.act_scale
         )
-        if "." in name:
-            parent_name = name.rsplit(".", 1)[0]
-            child_name = name[len(parent_name) + 1 :]
-            parent = model.model.get_submodule(parent_name)
-        else:
-            parent_name = ""
-            parent = model.model
-            child_name = name
-        setattr(parent, child_name, static_proj)
+        replace_module(model, name, static_proj)
+        del quantizer
+    cleanup_memory()
+
+
+def save_quantized_model(model, activation_scheme, save_dir):
+    print(f"Saving the model to {save_dir}")
+    static_q_dict = {
+        "quantization_config": {
+            "quant_method": "fp8",
+            "activation_scheme": activation_scheme,
+        }
+    }
+    model.config.update(static_q_dict)
+    model.save_pretrained(save_dir)
+    tokenizer.save_pretrained(save_dir)


 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model-id", type=str)
     parser.add_argument("--save-dir", type=str)
-    # parser.add_argument("--static-act", action="store_true")
+    parser.add_argument(
+        "--activation-scheme", type=str, default="static", choices=["static", "dynamic"]
+    )
     parser.add_argument("--num-samples", type=int, default=512)
     parser.add_argument("--max-seq-len", type=int, default=512)
     args = parser.parse_args()
@@ -240,22 +266,26 @@ def quantize_activations(model, calibration_tokens):
     model = AutoModelForCausalLM.from_pretrained(
         args.model_id, torch_dtype="auto", device_map="auto"
     )
+    print("Original model graph:\n", model)
     output = model.generate(input_ids=sample_input_tokens, max_new_tokens=20)
-    print("ORIGINAL:\n", tokenizer.decode(output[0]), "\n\n")
+    print("ORIGINAL OUTPUT:\n", tokenizer.decode(output[0]), "\n\n")

     # Quantize weights.
     quantize_weights(model)
+    print("Weight-quantized model graph:\n", model)
     output = model.generate(input_ids=sample_input_tokens, max_new_tokens=20)
-    print("WEIGHT QUANT:\n", tokenizer.decode(output[0]), "\n\n")
+    print("WEIGHT QUANT OUTPUT:\n", tokenizer.decode(output[0]), "\n\n")

-    # Quantize activations.
-    quantize_activations(model, calibration_tokens=calibration_tokens)
-    output = model.generate(input_ids=sample_input_tokens, max_new_tokens=20)
-    print("ACT QUANT:\n", tokenizer.decode(output[0]), "\n\n")
-
-    # Save the model fully quantized
-    print(f"Saving the model to {args.save_dir}")
-    static_q_dict = {"quantization_config": {"quant_method": "fp8", "scheme": "static"}}
-    model.config.update(static_q_dict)
-    model.save_pretrained(args.save_dir)
-    tokenizer.save_pretrained(args.save_dir)
+    if args.activation_scheme == "dynamic":
+        print("Exporting model with static weights and dynamic activations")
+        save_quantized_model(model, args.activation_scheme, args.save_dir)
+    else:
+        assert args.activation_scheme == "static"
+        # Quantize activations.
+        quantize_activations(model, calibration_tokens=calibration_tokens)
+        print("Weight and activation quantized model graph:\n", model)
+        output = model.generate(input_ids=sample_input_tokens, max_new_tokens=20)
+        print("ACT QUANT OUTPUT:\n", tokenizer.decode(output[0]), "\n\n")
+
+        print("Exporting model with static weights and static activations")
+        save_quantized_model(model, args.activation_scheme, args.save_dir)
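
For context, the FP8 scheme the script applies is plain per-tensor symmetric quantization: the scale maps the tensor's absolute maximum onto the float8 e4m3 maximum, the scaled values are clamped into the FP8 range, and the inverse scale is kept for dequantization. A minimal standalone sketch of that idea follows; the function name is illustrative and not part of the patch, and it assumes a PyTorch build with float8_e4m3fn support.

    import torch

    def fp8_per_tensor_sketch(tensor: torch.Tensor):
        # Per-tensor symmetric FP8 (e4m3) quantization, mirroring the
        # per_tensor_quantize logic touched by this diff.
        finfo = torch.finfo(torch.float8_e4m3fn)
        min_val, max_val = tensor.aminmax()
        amax = min_val.abs().max(max_val.abs())
        scale = finfo.max / amax.clamp(min=1e-12)  # map absmax onto the FP8 max
        qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
        qweight = qweight.to(torch.float8_e4m3fn)
        return qweight, scale.float().reciprocal()  # inverse scale used at dequant time

With the new flag, the script can be run with --activation-scheme dynamic to export right after weight quantization (no calibration pass), or with --activation-scheme static (the default) to also calibrate on the sampled tokens and bake per-tensor activation scales into the exported checkpoint.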