Support quantizing only kv cache (#135)

mgoin · web-flow · commit 043598643d90 · 2024-09-03T13:50:05.000-04:00
diff --git a/src/compressed_tensors/compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressor.py
@@ -271,6 +271,9 @@ def compress(
             v_proj_has_quant_output = 0
             for name, module in model.named_modules():
                 if not hasattr(module, "quantization_scheme"):
+                    # Unquantized q_proj has no quantized output, so count it too
+                    if name.endswith(".q_proj"):
+                        q_proj_has_no_quant_output += 1
                     continue
                 out_act = module.quantization_scheme.output_activations
                 if name.endswith(".q_proj") and out_act is None:
diff --git a/src/compressed_tensors/quantization/quant_scheme.py b/src/compressed_tensors/quantization/quant_scheme.py
@@ -110,6 +110,7 @@ def is_preset_scheme(name: str) -> bool:
     """
     return name.upper() in PRESET_SCHEMES
 
+UNQUANTIZED = dict()
 
 # 8 bit integer weights and 8 bit activations quantization
 W8A8 = dict(
@@ -208,6 +209,8 @@ def is_preset_scheme(name: str) -> bool:
 )
 
 PRESET_SCHEMES = {
+    # Unquantized (no-op)
+    "UNQUANTIZED": UNQUANTIZED,
     # Integer weight only schemes
     "W8A16": W8A16,
     "W4A16": W4A16,
diff --git a/src/compressed_tensors/quantization/utils/helpers.py b/src/compressed_tensors/quantization/utils/helpers.py
@@ -181,7 +181,7 @@ def calculate_compression_ratio(model: Module) -> float:
         for parameter in model.parameters():
             uncompressed_bits = get_torch_bit_depth(parameter)
             compressed_bits = uncompressed_bits
-            if is_module_quantized(submodule):
+            if is_module_quantized(submodule) and submodule.quantization_scheme.weights:
                 compressed_bits = submodule.quantization_scheme.weights.num_bits
 
             num_weights = parameter.numel()