 import inspect
 from typing import Callable, Optional, Tuple

-import torch
-import transformers
 from compressed_tensors.quantization import QuantizationStrategy, forward_quantize
 from compressed_tensors.quantization.lifecycle.initialize import (
     _initialize_scale_zero_point,
 )
 from compressed_tensors.utils import getattr_chain
 from compressed_tensors.utils.internal import InternalModule
-from packaging import version
 from torch import Tensor
+from torch.nn import Module
 from torch.utils.hooks import RemovableHandle
 from transformers import Cache, PreTrainedModel


-__all__ = ["KV_CACHE_ATTR", "QuantizedKVCache"]
+__all__ = [
+    "QuantizedKVCache",
+    "initialize_hooked_kv_cache",
+    "register_key_hook",
+    "register_value_hook",
+]


 KV_CACHE_ATTR = "kv_cache"


 class QuantizedKVCache(InternalModule):
-    def __init__(self, attn_module: torch.nn.Module):
+    """
+    QuantizedKVCache module which wraps the functionality of any existing kv cache.
+    Unlike transformers `Cache` instances, this cache is a `torch.nn.Module` which
+    can be hooked to trigger transforms and calibration hooks.
+
+    This module works by being registered as a submodule of attention modules via
+    `initialize_hooked_kv_cache`, then adding a hook which replaces the
+    `past_key_values` kwarg with this module. This module adopts the functionality
+    of the replaced cache, preserving caching functionality such as sliding window
+    attention, etc.
+
+    :param attn_module: parent attention module
+    """
+
+    def __init__(self, attn_module: Module):
         super().__init__()
-        self.attn_module_container = [attn_module]  # avoid nn.Module circular reference
+        self.attn_module_container = [attn_module]  # avoid circular reference
         self.past_key_values: Optional[Cache] = None
         self._qparams_initialized = False
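The hunk below shows only the tail of `forward`. For orientation, here is a minimal sketch of the delegation pattern the docstring describes; it is an assumption-laden reconstruction rather than the PR's exact body, and it presumes compressed-tensors' `forward_quantize(module, value, base_name, args)` helper and the wrapped cache's standard `update` signature:

    def forward(self, key_states: Tensor, value_states: Tensor, *args, **kwargs) -> Tuple[Tensor, Tensor]:
        # fake-quantize key/value states using the qparams registered on the attention module
        module = self.attn_module_container[0]
        quant_args = getattr_chain(module, "quantization_scheme.input_activations", None)
        if quant_args is not None:
            key_states = forward_quantize(module, key_states, "k", quant_args)
            value_states = forward_quantize(module, value_states, "v", quant_args)

        # delegate to the wrapped cache so behavior such as sliding windows is preserved
        ret = self.past_key_values.update(key_states, value_states, *args, **kwargs)

        self.past_key_values = None  # release the reference after use, as the diff's tail shows
        return ret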
@@ -70,13 +86,19 @@ def forward(
         self.past_key_values = None
         return ret

-    def initialize_qparams_once(self, model: PreTrainedModel, module: torch.nn.Module):
+    def initialize_qparams_once(self, model: PreTrainedModel, module: Module):
+        """
+        Initialize kv cache quantization parameters if they have not already been
+        initialized
+
+        :param model: parent model of attention module
+        :param module: attention module to initialize with
+        """
         assert module is self.attn_module_container[0]
         scheme = getattr(module, "quantization_scheme", None)
         quant_args = getattr(scheme, "input_activations", None)

         if not self._qparams_initialized and quant_args is not None:
-            # TODO: use model.config.num_key_value_heads to find key_size, value_size
             assert quant_args.strategy == QuantizationStrategy.TENSOR
             _initialize_scale_zero_point(module, "k", quant_args)
             _initialize_scale_zero_point(module, "v", quant_args)
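Worth noting what these two calls leave behind: in compressed-tensors, `_initialize_scale_zero_point` registers qparams on the passed module under a `{base_name}_scale` / `{base_name}_zero_point` naming convention, so after initialization the attention module should carry parameters along these lines (names assumed from that convention, not spelled out in this diff):

    attn_module = model.model.layers[0].self_attn  # assumes a Llama-style module tree
    assert hasattr(attn_module, "k_scale") and hasattr(attn_module, "k_zero_point")
    assert hasattr(attn_module, "v_scale") and hasattr(attn_module, "v_zero_point")
    assert attn_module.k_scale.numel() == 1  # TENSOR strategy: one scale per tensor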
@@ -86,19 +108,7 @@ def initialize_qparams_once(self, model: PreTrainedModel, module: torch.nn.Modul
 # ----- initialize ----- #


-def initialize_hooked_kv_cache(
-    model: PreTrainedModel, module: torch.nn.Module, quantize: bool = False
-):
-    if not hasattr(module, KV_CACHE_ATTR):
-        module.register_module(KV_CACHE_ATTR, QuantizedKVCache(module))
-        module.register_forward_pre_hook(kv_cache_attention_hook, with_kwargs=True)
-
-    kv_cache: QuantizedKVCache = getattr(module, KV_CACHE_ATTR)
-    if quantize:
-        kv_cache.initialize_qparams_once(model, module)
-
-
-def kv_cache_attention_hook(module: torch.nn.Module, args, kwargs):
+def _kv_cache_attention_hook(module: Module, args, kwargs):
     kv_cache: QuantizedKVCache = getattr(module, KV_CACHE_ATTR)
     _past_kv_name = (
         "past_key_values"  # transformers#39956
@@ -111,10 +121,38 @@ def kv_cache_attention_hook(module: torch.nn.Module, args, kwargs):
     return args, kwargs


+def initialize_hooked_kv_cache(
+    model: PreTrainedModel, module: Module, quantize: bool = False
+):
+    """
+    Initialize a `QuantizedKVCache` instance attached to an attention module
+
+    :param model: parent model of attention module
+    :param module: attention module to initialize with
+    :param quantize: initialize kv cache quantization parameters
+    """
+    if not hasattr(module, KV_CACHE_ATTR):
+        module.register_module(KV_CACHE_ATTR, QuantizedKVCache(module))
+        module.register_forward_pre_hook(_kv_cache_attention_hook, with_kwargs=True)
+
+    kv_cache: QuantizedKVCache = getattr(module, KV_CACHE_ATTR)
+    if quantize:
+        kv_cache.initialize_qparams_once(model, module)
+
+
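A hedged usage sketch for the initializer (the checkpoint name and the Llama-style module tree are illustrative assumptions, not part of this diff):

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # hypothetical choice
    for layer in model.model.layers:  # assumes a Llama-style layout
        initialize_hooked_kv_cache(model, layer.self_attn, quantize=True)

Note that the `hasattr` guard makes the call idempotent: repeated calls reuse the existing `kv_cache` submodule, and `quantize=True` only creates qparams when the module already carries a `quantization_scheme` with input activation args.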
 # ----- hooks ----- #


-def register_key_hook(module: torch.nn.Module, hook: Callable) -> RemovableHandle:
+def register_key_hook(
+    module: Module, hook: Callable[[Module, Tensor], Optional[Tensor]]
+) -> RemovableHandle:
+    """
+    Register a hook which takes post-rope key states as an argument and
+    returns the modified key states or `None`
+
+    :param module: attention module to add hook to
+    :param hook: key hook function
+    """
     kv_cache: QuantizedKVCache = getattr(module, KV_CACHE_ATTR)

     def _hook(cache: QuantizedKVCache, args, kwargs):
@@ -128,7 +166,16 @@ def _hook(cache: QuantizedKVCache, args, kwargs):
     return kv_cache.register_forward_pre_hook(_hook, with_kwargs=True)


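With the wrapper in place, a calibration-style key hook might be registered as follows (the running-max observer and the `_k_observed_max` attribute are stand-ins for illustration; only the hook signature comes from this diff):

    def observe_keys(module: Module, key_states: Tensor) -> Optional[Tensor]:
        # track a running max of post-rope key magnitudes; returning None leaves the states unchanged
        seen = getattr(module, "_k_observed_max", 0.0)
        module._k_observed_max = max(seen, key_states.abs().amax().item())
        return None

    handle = register_key_hook(attn_module, observe_keys)
    # ... run calibration forward passes ...
    handle.remove()  # detach once calibration is done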
-def register_value_hook(module: torch.nn.Module, hook: Callable) -> RemovableHandle:
+def register_value_hook(
+    module: Module, hook: Callable[[Module, Tensor], Optional[Tensor]]
+) -> RemovableHandle:
+    """
+    Register a hook which takes value states as an argument and
+    returns the modified value states or `None`
+
+    :param module: attention module to add hook to
+    :param hook: value hook function
+    """
     kv_cache: QuantizedKVCache = getattr(module, KV_CACHE_ATTR)

     def _hook(cache: QuantizedKVCache, args, kwargs):
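And symmetrically for values; here the hook rewrites the states rather than observing them (the clamp threshold is arbitrary, purely for illustration):

    def clip_values(module: Module, value_states: Tensor) -> Optional[Tensor]:
        # returning a tensor replaces the value states that the cache receives
        return value_states.clamp(-10.0, 10.0)

    handle = register_value_hook(attn_module, clip_values)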