Commit 0249168

Switch from output_scale to kv_scale
1 parent e6c2225 commit 0249168

3 files changed: +31 -9 lines changed

auto_fp8/modeling.py

Lines changed: 15 additions & 0 deletions
@@ -28,11 +28,15 @@ def __init__(
         )
 
         if quantize_config.kv_cache_quant_targets:
+<<<<<<< HEAD
 <<<<<<< HEAD
             kv_cache_quant_layers = get_kv_cache_quant_layers(
 =======
             kv_cache_quant_layers = get_kv_cache_quant_layer(
 >>>>>>> 3ee9283 (Support calibrating kv cache scales)
+=======
+            kv_cache_quant_layers = get_kv_cache_quant_layers(
+>>>>>>> c3acdee (Switch from output_scale to kv_scale)
                 self.model, quantize_config.kv_cache_quant_targets
             )
             if len(kv_cache_quant_layers) == 0:
@@ -182,20 +186,26 @@ def get_layers_to_ignore(model, ignore_patterns) -> List[str]:
     return list(ignored_layers)
 
 
+<<<<<<< HEAD
 <<<<<<< HEAD
 def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
     kv_cache_quant_layers = []
 =======
 def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
     kv_cache_quant_layers = set()
 >>>>>>> 3ee9283 (Support calibrating kv cache scales)
+=======
+def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
+    kv_cache_quant_layers = []
+>>>>>>> c3acdee (Switch from output_scale to kv_scale)
 
     for name, linear in model.named_modules():
         if not isinstance(linear, torch.nn.Linear):
             continue
 
         for output_quant_target in kv_cache_quant_targets:
             if name.endswith(output_quant_target):
+<<<<<<< HEAD
 <<<<<<< HEAD
                 kv_cache_quant_layers.append(name)
 
@@ -205,3 +215,8 @@ def get_kv_cache_quant_layer(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
 
     return list(kv_cache_quant_layers)
 >>>>>>> 3ee9283 (Support calibrating kv cache scales)
+=======
+                kv_cache_quant_layers.append(name)
+
+    return kv_cache_quant_layers
+>>>>>>> c3acdee (Switch from output_scale to kv_scale)
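
Resolved in favor of this commit's side (the c3acdee lines above), the helper reads as below. This is a sketch for readability only, with the imports it needs added; the committed file itself still carries the unresolved markers. The helper walks every torch.nn.Linear in the model and collects, in traversal order, the module names ending in one of the kv_cache_quant_targets suffixes, returning a plain list rather than the earlier set.

from typing import List, Tuple

import torch


def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
    # This commit returns a list (ordered) instead of a set.
    kv_cache_quant_layers = []

    for name, linear in model.named_modules():
        # Only Linear projections (e.g. k_proj / v_proj) are candidates.
        if not isinstance(linear, torch.nn.Linear):
            continue

        for output_quant_target in kv_cache_quant_targets:
            if name.endswith(output_quant_target):
                kv_cache_quant_layers.append(name)

    return kv_cache_quant_layers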

auto_fp8/quantize.py

Lines changed: 4 additions & 4 deletions
@@ -185,9 +185,9 @@ def __init__(
     def forward(self, x):
         qinput, x_input_scale = per_tensor_quantize(x)
         if self.input_scale is None:
-            self.input_scale = torch.nn.Parameter(x_input_scale)
+            self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
         elif x_input_scale > self.input_scale:
-            self.input_scale = torch.nn.Parameter(x_input_scale)
+            self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
         output = fp8_gemm(
             A=qinput,
             A_scale=self.input_scale,
@@ -201,9 +201,9 @@ def forward(self, x):
         if self.quantize_output:
             qoutput, output_scale = per_tensor_quantize(output)
             if self.output_scale is None:
-                self.output_scale = torch.nn.Parameter(output_scale)
+                self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False)
             elif output_scale > self.output_scale:
-                self.output_scale = torch.nn.Parameter(output_scale)
+                self.output_scale = torch.nn.Parameter(output_scale, requires_grad=False)
             output = qoutput.to(output.dtype) * output_scale
 
         return output
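
The only functional change in this file is that the input and output scales are rewrapped with requires_grad=False. Below is a minimal, self-contained sketch of the same running-max pattern; RunningMaxScale and its observe method are illustrative stand-ins, not the repo's FP8 linear module, and the constant assumes a PyTorch build that provides float8_e4m3fn. Marking the scale as a non-trainable Parameter keeps it out of autograd while still registering it on the module, so it is saved in state_dict() and follows .to()/.cuda().

import torch


class RunningMaxScale(torch.nn.Module):
    """Illustrative calibration observer that tracks a per-tensor max scale."""

    def __init__(self):
        super().__init__()
        self.input_scale = None

    def observe(self, x: torch.Tensor):
        # Stand-in for per_tensor_quantize(): scale = amax / FP8-E4M3 max.
        scale = x.abs().max() / torch.finfo(torch.float8_e4m3fn).max
        if self.input_scale is None or scale > self.input_scale:
            # requires_grad=False: the scale is a calibration statistic, not a
            # trainable weight, but it still lands in the module's state_dict.
            self.input_scale = torch.nn.Parameter(scale, requires_grad=False)


obs = RunningMaxScale()
obs.observe(torch.randn(4, 8))
assert obs.input_scale.requires_grad is False
assert "input_scale" in obs.state_dict()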

tests/test_auto_fp8.py

Lines changed: 12 additions & 5 deletions
@@ -64,6 +64,9 @@ def test_dynamic_quantization(model_id, target_size):
 
 <<<<<<< HEAD
 <<<<<<< HEAD
+<<<<<<< HEAD
+=======
+>>>>>>> c3acdee (Switch from output_scale to kv_scale)
     # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
@@ -114,6 +117,7 @@ def test_static_quantization(model_id, target_size):
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)
 
+<<<<<<< HEAD
 <<<<<<< HEAD
     # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
@@ -157,6 +161,9 @@ def test_kv_cache_static_quantization(model_id, target_size):
 =======
     # We expect the model to be < 160MB
 >>>>>>> 415c0b7 (Add fixed target sizes)
+=======
+    # We expect the quantized model to be a certain size
+>>>>>>> c3acdee (Switch from output_scale to kv_scale)
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
 
@@ -182,18 +189,18 @@ def test_kv_cache_static_quantization(model_id, target_size):
 
     tensors = safetensors.torch.load_file(f"{quantized_model_dir}/model.safetensors")
     proj_linear_count = 0
-    output_scale_count = 0
+    kv_scale_count = 0
     for name, _ in tensors.items():
         if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"):
             proj_linear_count += 1
-        if name.endswith("k_proj.output_scale") or name.endswith("v_proj.output_scale"):
-            output_scale_count += 1
-    assert proj_linear_count == output_scale_count
+        if name.endswith("kv_scale"):
+            kv_scale_count += 1
+    assert proj_linear_count // 2 == kv_scale_count
 
     # Measure checkpoint size and cleanup
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)
 
-    # We expect the model to be < 160MB
+    # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
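
The rewritten check reflects the new checkpoint layout: each attention block still stores one k_proj.weight and one v_proj.weight, but instead of a separate output_scale per projection there is a single shared kv_scale, hence the // 2 in the assertion. A self-contained sketch of that counting logic over made-up tensor names (the names below are illustrative, not read from a real checkpoint):

# Hypothetical safetensors keys for a two-layer model with a shared kv_scale.
tensor_names = [
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.0.self_attn.kv_scale",
    "model.layers.1.self_attn.k_proj.weight",
    "model.layers.1.self_attn.v_proj.weight",
    "model.layers.1.self_attn.kv_scale",
]

proj_linear_count = 0
kv_scale_count = 0
for name in tensor_names:
    if name.endswith("k_proj.weight") or name.endswith("v_proj.weight"):
        proj_linear_count += 1
    if name.endswith("kv_scale"):
        kv_scale_count += 1

# k_proj and v_proj share one kv_scale per attention layer.
assert proj_linear_count // 2 == kv_scale_count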
