
Commit c3acdee

Switch from output_scale to kv_scale
1 parent 57c31bb commit c3acdee

File tree: 3 files changed, +40 -16 lines


auto_fp8/modeling.py

Lines changed: 5 additions & 5 deletions
@@ -28,7 +28,7 @@ def __init__(
         )
 
         if quantize_config.kv_cache_quant_targets:
-            kv_cache_quant_layers = get_kv_cache_quant_layer(
+            kv_cache_quant_layers = get_kv_cache_quant_layers(
                 self.model, quantize_config.kv_cache_quant_targets
             )
             if len(kv_cache_quant_layers) == 0:
@@ -159,15 +159,15 @@ def get_layers_to_ignore(model, ignore_patterns) -> List[str]:
     return list(ignored_layers)
 
 
-def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
-    kv_cache_quant_layers = set()
+def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
+    kv_cache_quant_layers = []
 
     for name, linear in model.named_modules():
         if not isinstance(linear, torch.nn.Linear):
            continue
 
        for output_quant_target in kv_cache_quant_targets:
            if name.endswith(output_quant_target):
-                kv_cache_quant_layers.add(name)
+                kv_cache_quant_layers.append(name)
 
-    return list(kv_cache_quant_layers)
+    return kv_cache_quant_layers
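
Why the return type changed from set to list: named_modules() walks submodules in registration order, so collecting matches into a list keeps each layer's k_proj immediately followed by its v_proj, which the kv_scale post-processing in quantize.py relies on; a set would lose that ordering. A minimal sketch with a made-up toy model (the Attention/ToyModel classes below are illustrative, not from this repo):

import torch

class Attention(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Registration order matters: named_modules() yields these in this order
        self.q_proj = torch.nn.Linear(8, 8)
        self.k_proj = torch.nn.Linear(8, 8)
        self.v_proj = torch.nn.Linear(8, 8)

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList([Attention() for _ in range(2)])

model = ToyModel()
targets = ("k_proj", "v_proj")

# Same filtering idea as get_kv_cache_quant_layers: keep Linear modules whose
# qualified name ends with a target, preserving traversal order
ordered = [
    name
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear) and name.endswith(targets)
]
print(ordered)
# ['layers.0.k_proj', 'layers.0.v_proj', 'layers.1.k_proj', 'layers.1.v_proj']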

auto_fp8/quantize.py

Lines changed: 28 additions & 4 deletions
@@ -152,9 +152,9 @@ def __init__(
     def forward(self, x):
         qinput, x_input_scale = per_tensor_quantize(x)
         if self.input_scale is None:
-            self.input_scale = torch.nn.Parameter(x_input_scale)
+            self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
         elif x_input_scale > self.input_scale:
-            self.input_scale = torch.nn.Parameter(x_input_scale)
+            self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
         output = fp8_gemm(
             A=qinput,
             A_scale=self.input_scale,
@@ -168,9 +168,9 @@ def forward(self, x):
         if self.quantize_output:
             qoutput, output_scale = per_tensor_quantize(output)
             if self.output_scale is None:
-                self.output_scale = torch.nn.Parameter(output_scale)
+                self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False)
             elif output_scale > self.output_scale:
-                self.output_scale = torch.nn.Parameter(output_scale)
+                self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False)
             output = qoutput.to(output.dtype) * output_scale
 
         return output
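
A likely motivation for the requires_grad=False additions: these scales are calibration statistics, not trainable weights, so wrapping them as frozen Parameters keeps them in the module's state_dict (and therefore in the exported checkpoint) without asking autograd to track them. A small sketch of the same running-max pattern, using a stand-in scale formula rather than the repo's per_tensor_quantize helper:

import torch

class ScaleObserver(torch.nn.Module):
    # Toy running-max observer mirroring the forward() pattern above
    def __init__(self):
        super().__init__()
        self.input_scale = None

    def forward(self, x):
        # Stand-in scale: largest magnitude seen in this batch (illustrative only)
        observed = x.abs().max()
        if self.input_scale is None or observed > self.input_scale:
            # Frozen Parameter: saved via state_dict, ignored by autograd
            self.input_scale = torch.nn.Parameter(observed, requires_grad=False)
        return x

obs = ScaleObserver()
obs(torch.randn(4, 8))
print(obs.input_scale.requires_grad)      # False
print("input_scale" in obs.state_dict())  # True
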
@@ -307,6 +307,30 @@ def quantize_activations(
         del quantizer
     cleanup_memory()
 
+    # Post-process step for kv cache scales to take the k/v module
+    # `output_scale` parameters, take the max of them, and store them in
+    # the parent attention module as `kv_scale`
+    # NOTE: if we want to switch to the `output_scale` representation, we can simply remove this block
+    if hasattr(quantize_config, "kv_cache_quant_layers"):
+        # Assumes that list is ordered such that [layer0.k_proj, layer0.v_proj, layer1.k_proj, layer1.v_proj, ...]
+        # so we make a list of tuples [(layer0.k_proj, layer0.v_proj), (layer1.k_proj, layer1.v_proj), ...]
+        kv_proj_pairs = zip(*[iter(quantize_config.kv_cache_quant_layers)]*2)
+        for k_proj_name, v_proj_name in kv_proj_pairs:
+            parent_module_name = ".".join(k_proj_name.split(".")[:-1])
+            assert parent_module_name == ".".join(v_proj_name.split(".")[:-1])
+            parent_module = dict(model.named_modules())[parent_module_name]
+
+            k_proj = dict(model.named_modules())[k_proj_name]
+            v_proj = dict(model.named_modules())[v_proj_name]
+
+            kv_scale = max(k_proj.output_scale, v_proj.output_scale)
+            parent_module.kv_scale = torch.nn.Parameter(kv_scale, requires_grad=False)
+
+            # Remove output_scale from k_proj and v_proj
+            k_proj.output_scale = None
+            v_proj.output_scale = None
+    cleanup_memory()
+
 
 def save_quantized_model(
     model: AutoModelForCausalLM,
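
The pairing idiom in the new post-processing block is the only subtle part: zip(*[iter(seq)]*2) advances a single iterator twice per step, turning the flat ordered layer list into consecutive (k_proj, v_proj) tuples. A standalone sketch of that idiom and the max-of-scales fold, using made-up layer names and scale values rather than real modules:

kv_cache_quant_layers = [
    "model.layers.0.self_attn.k_proj",
    "model.layers.0.self_attn.v_proj",
    "model.layers.1.self_attn.k_proj",
    "model.layers.1.self_attn.v_proj",
]
# Illustrative per-module output scales observed during calibration
output_scales = {
    "model.layers.0.self_attn.k_proj": 0.021,
    "model.layers.0.self_attn.v_proj": 0.034,
    "model.layers.1.self_attn.k_proj": 0.018,
    "model.layers.1.self_attn.v_proj": 0.012,
}

# zip(*[iter(seq)]*2) consumes the same iterator twice per step,
# yielding consecutive (k_proj, v_proj) pairs from the flat list
for k_name, v_name in zip(*[iter(kv_cache_quant_layers)]*2):
    parent = k_name.rsplit(".", 1)[0]
    kv_scale = max(output_scales[k_name], output_scales[v_name])
    print(f"{parent}.kv_scale = {kv_scale}")
# model.layers.0.self_attn.kv_scale = 0.034
# model.layers.1.self_attn.kv_scale = 0.018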

tests/test_auto_fp8.py

Lines changed: 7 additions & 7 deletions
@@ -30,7 +30,7 @@ def test_dynamic_quantization(model_id, target_size):
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)
 
-    # We expect the model to be a certain size
+    # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
 
@@ -55,7 +55,7 @@ def test_static_quantization(model_id, target_size):
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)
 
-    # We expect the model to be < 160MB
+    # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
 
@@ -81,18 +81,18 @@ def test_kv_cache_static_quantization(model_id, target_size):
 
     tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors")
     proj_linear_count = 0
-    output_scale_count = 0
+    kv_scale_count = 0
     for name, _ in tensors.items():
         if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"):
             proj_linear_count += 1
-        if name.endswith("k_proj.output_scale") or name.endswith("v_proj.output_scale"):
-            output_scale_count += 1
-    assert proj_linear_count == output_scale_count
+        if name.endswith("kv_scale"):
+            kv_scale_count += 1
+    assert proj_linear_count // 2 == kv_scale_count
 
     # Measure checkpoint size and cleanup
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)
 
-    # We expect the model to be < 160MB
+    # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
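
The updated assertion encodes the new checkpoint layout: every attention block still contributes two quantized projection weights (k_proj.weight and v_proj.weight) but now only one shared kv_scale on the parent module, hence the expected 2:1 ratio. The same counting over a hypothetical tensor-name list:

# Hypothetical tensor names for one decoder layer of a kv-cache-quantized checkpoint
tensor_names = [
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.0.self_attn.kv_scale",
]

proj_linear_count = sum(
    name.endswith(("k_proj.weight", "v_proj.weight")) for name in tensor_names
)
kv_scale_count = sum(name.endswith("kv_scale") for name in tensor_names)

# Two projections share one kv_scale, so the ratio is 2:1
assert proj_linear_count // 2 == kv_scale_count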
