Commit d0dd9d0

Support calibrating kv cache scales
1 parent 4b2092c commit d0dd9d0

File tree

3 files changed: +213 additions, 0 deletions


auto_fp8/modeling.py

Lines changed: 25 additions & 0 deletions
@@ -28,7 +28,11 @@ def __init__(
         )
 
         if quantize_config.kv_cache_quant_targets:
+<<<<<<< HEAD
             kv_cache_quant_layers = get_kv_cache_quant_layers(
+=======
+            kv_cache_quant_layers = get_kv_cache_quant_layer(
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
                 self.model, quantize_config.kv_cache_quant_targets
             )
             if len(kv_cache_quant_layers) == 0:
@@ -108,6 +112,13 @@ def skip(*args, **kwargs):
         return cls(model, quantize_config)
 
     def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):
+<<<<<<< HEAD
+=======
+        def _prepare_calibration_data(calibration_tokens):
+            if hasattr(calibration_tokens, "input_ids"):
+                return calibration_tokens.input_ids
+            return calibration_tokens
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
 
         # Always quantize the weights as they do not require calibration data
         quantize_weights(self.model, self.quantize_config)
@@ -116,13 +127,16 @@ def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):
         assert (
             calibration_tokens is not None
         ), "Calibration tokens required for activation quantization"
+<<<<<<< HEAD
 
 
         def _prepare_calibration_data(calibration_tokens):
             if hasattr(calibration_tokens, "input_ids"):
                 return calibration_tokens.input_ids
             return calibration_tokens
 
+=======
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
         quantize_activations(
             self.model,
             self.quantize_config,
@@ -159,15 +173,26 @@ def get_layers_to_ignore(model, ignore_patterns) -> List[str]:
     return list(ignored_layers)
 
 
+<<<<<<< HEAD
 def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
     kv_cache_quant_layers = []
+=======
+def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
+    kv_cache_quant_layers = set()
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
 
     for name, linear in model.named_modules():
         if not isinstance(linear, torch.nn.Linear):
             continue
 
         for output_quant_target in kv_cache_quant_targets:
             if name.endswith(output_quant_target):
+<<<<<<< HEAD
                 kv_cache_quant_layers.append(name)
 
     return kv_cache_quant_layers
+=======
+                kv_cache_quant_layers.add(name)
+
+    return list(kv_cache_quant_layers)
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
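
Both sides of the conflicted helper above collect the names of Linear modules whose outputs feed the kv cache; the incoming side only renames the function and deduplicates via a set. For readability, here is how the helper reads if the 3ee9283 side of the markers is kept. This is an editorial sketch of the resolved state, not a line of the committed diff:

    from typing import List, Tuple

    import torch


    def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
        # Gather every torch.nn.Linear whose name ends with one of the kv-cache
        # targets (e.g. "k_proj", "v_proj"); a set keeps the result duplicate-free.
        kv_cache_quant_layers = set()

        for name, linear in model.named_modules():
            if not isinstance(linear, torch.nn.Linear):
                continue

            for output_quant_target in kv_cache_quant_targets:
                if name.endswith(output_quant_target):
                    kv_cache_quant_layers.add(name)

        return list(kv_cache_quant_layers)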

auto_fp8/quantize.py

Lines changed: 132 additions & 0 deletions
@@ -72,11 +72,19 @@ def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype):
         # Deal with empty tensors (triggeted by empty MoE experts)
         return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device)
 
+<<<<<<< HEAD
     # TODO: Disable native fp8 gemm for now, always just dequantize
     # native_fp8_support = (
     #     torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
     # )
     native_fp8_support = False
+=======
+    native_fp8_support = (
+        torch.cuda.is_available()
+        and torch.cuda.get_device_capability() >= (8, 9)
+        and False
+    )
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
     if native_fp8_support:
         need_reshape = A.dim() == 3
         if need_reshape:
@@ -108,6 +116,7 @@ def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype):
 
 # Class responsible for quantizing weights
 class FP8DynamicLinear(torch.nn.Module):
+<<<<<<< HEAD
     def __init__(
         self,
         weight: torch.Tensor,
@@ -125,13 +134,114 @@ def forward(self, x):
             A=qinput,
             A_scale=x_scale,
             B=self.weight,
+=======
+    def __init__(
+        self,
+        qweight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        bias: torch.nn.Parameter,
+    ):
+        super().__init__()
+        self.qweight = torch.nn.Parameter(qweight, requires_grad=False)
+        self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+        self.bias = bias
+
+    def forward(self, x):
+        qinput, x_scale = per_tensor_quantize(x)
+        output = fp8_gemm(
+            A=qinput,
+            A_scale=x_scale,
+            B=self.qweight,
             B_scale=self.weight_scale,
             bias=self.bias,
             out_dtype=x.dtype,
         )
         return output
 
 
+# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer
+class FP8StaticLinearQuantizer(torch.nn.Module):
+    def __init__(
+        self,
+        qweight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        bias: torch.nn.Parameter,
+        quantize_output: bool = False,
+    ):
+        super().__init__()
+        self.qweight = torch.nn.Parameter(qweight, requires_grad=False)
+        self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+        self.bias = bias
+        self.input_scale = None
+        self.output_scale = None
+        self.quantize_output = quantize_output
+
+    def forward(self, x):
+        qinput, x_input_scale = per_tensor_quantize(x)
+        if self.input_scale is None:
+            self.input_scale = torch.nn.Parameter(x_input_scale)
+        elif x_input_scale > self.input_scale:
+            self.input_scale = torch.nn.Parameter(x_input_scale)
+        output = fp8_gemm(
+            A=qinput,
+            A_scale=self.input_scale,
+            B=self.qweight,
+            B_scale=self.weight_scale,
+            bias=self.bias,
+            out_dtype=x.dtype,
+        )
+
+        # Optionally, quantize output and record scale
+        if self.quantize_output:
+            qoutput, output_scale = per_tensor_quantize(output)
+            if self.output_scale is None:
+                self.output_scale = torch.nn.Parameter(output_scale)
+            elif output_scale > self.output_scale:
+                self.output_scale = torch.nn.Parameter(output_scale)
+            output = qoutput.to(output.dtype) * output_scale
+
+        return output
+
+
+# Module responsible for representing the final checkpoint representation
+class FP8StaticLinear(torch.nn.Module):
+    def __init__(
+        self,
+        qweight: torch.nn.Parameter,
+        weight_scale: torch.nn.Parameter,
+        bias: torch.nn.Parameter,
+        input_scale: torch.nn.Parameter,
+        output_scale: Optional[torch.nn.Parameter] = None,
+    ):
+        super().__init__()
+        self.qweight = qweight
+        self.weight_scale = weight_scale
+        self.bias = bias
+        self.input_scale = input_scale
+        self.output_scale = output_scale
+
+    def forward(self, x):
+        qinput = static_per_tensor_quantize(x, self.input_scale)
+        output = fp8_gemm(
+            A=qinput,
+            A_scale=self.input_scale,
+            B=self.qweight,
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
+            B_scale=self.weight_scale,
+            bias=self.bias,
+            out_dtype=x.dtype,
+        )
+<<<<<<< HEAD
+=======
+
+        if self.output_scale:
+            qoutput = static_per_tensor_quantize(output, self.output_scale)
+            output = qoutput.to(output.dtype) * self.output_scale
+
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
+        return output
+
+
 # Module responsible for taking already quantized weights, and recording input scales (and possibly output scales) using an activation observer
 class FP8StaticLinearQuantizer(torch.nn.Module):
     def __init__(
@@ -237,7 +347,11 @@ def quantize_weights(
         quant_weight, weight_scale = per_tensor_quantize(linear.weight)
         bias = copy.deepcopy(linear.bias) if linear.bias is not None else None
         quant_linear = FP8DynamicLinear(
+<<<<<<< HEAD
             weight=quant_weight, weight_scale=weight_scale, bias=bias
+=======
+            qweight=quant_weight, weight_scale=weight_scale, bias=bias
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
         )
         replace_module(model, name, quant_linear)
         del linear.weight
@@ -259,7 +373,11 @@ def quantize_activations(
         ):
             continue
         quantizer = FP8StaticLinearQuantizer(
+<<<<<<< HEAD
            weight=dynamic_quant_linear.weight,
+=======
+            qweight=dynamic_quant_linear.qweight,
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
            weight_scale=dynamic_quant_linear.weight_scale,
            bias=dynamic_quant_linear.bias,
            quantize_output=(
@@ -272,12 +390,22 @@ def quantize_activations(
     cleanup_memory()
 
     # Pass through calibration data to measure activation scales
+<<<<<<< HEAD
     with torch.inference_mode():
         with tqdm.tqdm(total=calibration_tokens.shape[0], desc="Calibrating activation scales") as pbar:
             for row_idx in range(calibration_tokens.shape[0]):
                 model(calibration_tokens[row_idx].reshape(1, -1))
                 cleanup_memory()
                 pbar.update(1)
+=======
+    with tqdm.tqdm(
+        total=calibration_tokens.shape[0], desc="Calibrating activation scales"
+    ) as pbar:
+        for row_idx in range(calibration_tokens.shape[0]):
+            model(calibration_tokens[row_idx].reshape(1, -1))
+            cleanup_memory()
+            pbar.update(1)
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
 
     # Replace dynamic quantizer observer with StaticLinear for export
     for name, quantizer in model.named_modules():
@@ -287,7 +415,11 @@ def quantize_activations(
         ):
             continue
         static_proj = FP8StaticLinear(
+<<<<<<< HEAD
            weight=quantizer.weight,
+=======
+            qweight=quantizer.qweight,
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
            weight_scale=quantizer.weight_scale,
            bias=quantizer.bias,
            input_scale=quantizer.input_scale,
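
The new FP8StaticLinearQuantizer and FP8StaticLinear modules lean on per_tensor_quantize and static_per_tensor_quantize, which are not part of these hunks. As a reference, a minimal sketch of how such per-tensor FP8 helpers are commonly written, assuming torch.float8_e4m3fn and max-abs scaling; this is not necessarily AutoFP8's exact implementation:

    import torch


    def per_tensor_quantize_sketch(tensor: torch.Tensor):
        # Pick one scale for the whole tensor so its max-abs value maps onto the
        # fp8 range; the returned scale is the dequantization scale.
        finfo = torch.finfo(torch.float8_e4m3fn)
        amax = tensor.abs().max().clamp(min=1e-12)
        scale = amax.float() / finfo.max
        qtensor = (tensor / scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
        return qtensor, scale


    def static_per_tensor_quantize_sketch(tensor: torch.Tensor, scale: torch.Tensor):
        # Same mapping, but with a previously calibrated (frozen) scale.
        finfo = torch.finfo(torch.float8_e4m3fn)
        return (tensor / scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)

Under this convention, the observer in FP8StaticLinearQuantizer simply keeps the largest scale it sees across calibration batches, which is what the input_scale/output_scale max-tracking in forward() does.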

tests/test_auto_fp8.py

Lines changed: 56 additions & 0 deletions
@@ -1,7 +1,10 @@
 import os
 import shutil
 
+<<<<<<< HEAD
 import pytest
+=======
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
 import safetensors.torch
 from transformers import AutoTokenizer
 
@@ -12,9 +15,15 @@
     ("Qwen/Qwen2-0.5B-Instruct", 620),
 ]
 
+<<<<<<< HEAD
 @pytest.mark.parametrize("model_id,target_size", MODELS)
 def test_dynamic_quantization(model_id, target_size):
     quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic"
+=======
+def test_dynamic_quantization():
+    model_id = "facebook/opt-125m"
+    quantized_model_dir = "opt-125m-fp8-dynamic"
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
 
     quantize_config = BaseQuantizeConfig(
         quant_method="fp8", activation_scheme="dynamic"
@@ -30,6 +39,7 @@ def test_dynamic_quantization(model_id, target_size):
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)
 
+<<<<<<< HEAD
     # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
@@ -38,6 +48,16 @@ def test_dynamic_quantization(model_id, target_size):
 @pytest.mark.parametrize("model_id,target_size", MODELS)
 def test_static_quantization(model_id, target_size):
     quantized_model_dir = model_id.split("/")[-1] + "-fp8-static"
+=======
+    # We expect the model to be < 160MB
+    target_size = 160 * (1024 * 1024)
+    assert model_size < target_size
+
+
+def test_static_quantization():
+    model_id = "facebook/opt-125m"
+    quantized_model_dir = "opt-125m-fp8-static"
+>>>>>>> 3ee9283 (Support calibrating kv cache scales)
 
     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
     examples = ["auto-fp8 is an easy-to-use model quantization library"]
@@ -96,3 +116,39 @@ def test_kv_cache_static_quantization(model_id, target_size):
     # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
+
+
+def test_kv_cache_static_quantization():
+    model_id = "facebook/opt-125m"
+    quantized_model_dir = "opt-125m-fp8-static-kv"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+    examples = ["auto-fp8 is an easy-to-use model quantization library"]
+    examples = tokenizer(examples, return_tensors="pt")
+
+    quantize_config = BaseQuantizeConfig(
+        quant_method="fp8",
+        activation_scheme="static",
+        kv_cache_quant_targets=("k_proj", "v_proj"),
+    )
+
+    model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config)
+    model.model.to("cpu")
+
+    model.quantize(examples)
+    model.save_quantized(quantized_model_dir)
+
+    tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors")
+    count_matches = 0
+    for name, tensor in tensors.items():
+        if name.endswith("k_proj.output_scale") or name.endswith("v_proj.output_scale"):
+            count_matches += 1
+    assert count_matches == 24
+
+    # Measure checkpoint size and cleanup
+    model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
+    shutil.rmtree(quantized_model_dir)
+
+    # We expect the model to be < 160MB
+    target_size = 160 * (1024 * 1024)
+    assert model_size < target_size
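
The new test also doubles as the usage example for this feature: passing kv_cache_quant_targets makes quantize() record an output_scale on each matching projection, and those scales end up in the exported checkpoint. A condensed sketch of that flow, reusing the values from the test (the import line assumes the package's public exports as used elsewhere in the repo):

    from transformers import AutoTokenizer

    from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

    model_id = "facebook/opt-125m"
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    examples = tokenizer(
        ["auto-fp8 is an easy-to-use model quantization library"], return_tensors="pt"
    )

    # Static activation quantization plus kv-cache scale calibration on the
    # attention k/v projections.
    quantize_config = BaseQuantizeConfig(
        quant_method="fp8",
        activation_scheme="static",
        kv_cache_quant_targets=("k_proj", "v_proj"),
    )

    model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config)
    model.quantize(examples)  # calibrates input scales and k_proj/v_proj output scales
    model.save_quantized("opt-125m-fp8-static-kv")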
