Commit f5d640f

Refactor ObservedAttention (#166)
1 parent ea3a08e commit f5d640f

File tree

kvpress/pipeline.py
kvpress/presses/observed_attention_press.py
tests/fixtures.py

3 files changed: +3 -25 lines changed


kvpress/pipeline.py

Lines changed: 0 additions & 9 deletions
@@ -15,7 +15,6 @@
 from kvpress.presses.decoding_press import DecodingPress
 from kvpress.presses.finch_press import FinchPress
 from kvpress.presses.key_rerotation_press import KeyRerotationPress
-from kvpress.presses.observed_attention_press import ObservedAttentionPress
 from kvpress.presses.prefill_decoding_press import PrefillDecodingPress

 logger = logging.getLogger(__name__)

@@ -210,7 +209,6 @@ def _forward(
         self.model.model(
             input_ids=context_ids,
             past_key_values=cache,
-            output_attentions=self.output_attentions(press),
         )

         logger.debug(f"Context Length: {context_length}")

@@ -306,13 +304,6 @@ def generate_answer(
         answer = self.tokenizer.decode(torch.stack(generated_ids), skip_special_tokens=True)
         return answer

-    def output_attentions(self, press: BasePress):
-        if isinstance(press, ObservedAttentionPress):
-            return True
-        if hasattr(press, "press") and isinstance(press.press, ObservedAttentionPress):
-            return True
-        return False
-
     def postprocess(self, model_outputs, single_question):
         if single_question:
             return {"answer": model_outputs[0]}

kvpress/presses/observed_attention_press.py

Lines changed: 2 additions & 15 deletions
@@ -2,16 +2,13 @@
 # SPDX-License-Identifier: Apache-2.0


-import logging
 from dataclasses import dataclass

 import torch
 from torch import nn

 from kvpress.presses.scorer_press import ScorerPress

-logger = logging.getLogger(__name__)
-

 @dataclass
 class ObservedAttentionPress(ScorerPress):

@@ -22,27 +19,17 @@ class ObservedAttentionPress(ScorerPress):
     forward pass. Score for each key-value pair is the average attention weight
     it receives from all query tokens.

-    Requires: output_attentions=True and attn_implementation="eager".
+    Requires: attn_implementation="eager".

     Related to H2O (https://arxiv.org/abs/2306.14048).

     Parameters
     ----------
     compression_ratio : float, default=0.0
         Fraction of key-value pairs to remove during compression.
-    output_attentions : bool, default=True
-        Whether to output the attention weights. Must be set True but we keep it for backward compatibility.
     """

     compression_ratio: float = 0.0
-    output_attentions: bool = True
-
-    def __post_init__(self):
-        if not self.output_attentions:
-            # keep for backward compatibility, remove in version 1.0
-            raise ValueError(
-                "With transformers >= 4.54, " "ObservedAttentionPress will only work with output_attentions=True"
-            )

     def score(
         self,

@@ -53,7 +40,7 @@ def score(
         attentions: torch.Tensor,
         kwargs,
     ) -> torch.Tensor:
-        assert attentions is not None, 'Set output_attentions=True and attn_implementation="eager" to use this hook'
+        assert attentions is not None, 'Set attn_implementation="eager" to use this hook'
         scores = attentions.sum(2)
         bsz, num_key_value_heads, n_tokens, _ = keys.shape
         n_tokens_in_sum = torch.arange(n_tokens, 0, -1).to(attentions.device, attentions.dtype)
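The score computed in the hunk above is the average attention weight each key receives, corrected for the causal mask: key position i is only visible to the n_tokens - i query positions at or after it. Below is a standalone sketch of that averaging, covering only the lines shown here (the grouped-query-attention head handling that follows in the full file is omitted); the function name is illustrative, not the press's code.

import torch


def average_observed_attention(attentions: torch.Tensor) -> torch.Tensor:
    # attentions: (batch, num_heads, n_queries, n_keys) attention weights
    # produced by an eager attention forward pass over the prefill context.
    scores = attentions.sum(dim=2)  # total attention mass received by each key
    n_tokens = attentions.shape[-1]
    # Under a causal mask, key i is attended by n_tokens - i queries,
    # so divide by [n_tokens, n_tokens - 1, ..., 1] to get a per-key average.
    n_tokens_in_sum = torch.arange(n_tokens, 0, -1).to(attentions.device, attentions.dtype)
    return scores / n_tokens_in_sum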

tests/fixtures.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def unit_test_model():
 @pytest.fixture(scope="session")
 def unit_test_model_output_attention():
     model = AutoModelForCausalLM.from_pretrained(
-        "MaxJeblick/llama2-0b-unit-test", attn_implementation="eager", output_attentions=True
+        "MaxJeblick/llama2-0b-unit-test", attn_implementation="eager"
     ).eval()
     return model.to(get_device())
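A sketch of how a test might exercise this fixture with the refactored press, assuming kvpress's context-manager usage (with press(model): ...); the test name and input construction are illustrative, not part of this commit.

import torch

from kvpress import ObservedAttentionPress


def test_observed_attention_press(unit_test_model_output_attention):
    model = unit_test_model_output_attention  # eager-attention model from the fixture above
    press = ObservedAttentionPress(compression_ratio=0.5)
    input_ids = torch.randint(0, model.config.vocab_size, (1, 32), device=model.device)
    # The press hooks the attention layers and prunes the KV cache during prefill;
    # eager attention makes the attention weights available to its score() method.
    with press(model):
        model(input_ids)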
