from typing import Any, Dict, Optional, Tuple

import torch
+import torch.nn.functional as F

from transformer_lens.conversion_utils.conversion_steps.attention_auto_conversion import (
    AttentionAutoConversion,
)
from transformer_lens.conversion_utils.conversion_steps.base_hook_conversion import (
    BaseHookConversion,
)
-from transformer_lens.conversion_utils.conversion_steps.rearrange_hook_conversion import (
-    RearrangeHookConversion,
-)
from transformer_lens.hook_points import HookPoint
from transformer_lens.model_bridge.generalized_components.base import (
    GeneralizedComponent,
)


+class AttentionPatternConversion(BaseHookConversion):
+    """Custom conversion rule for attention patterns that always removes the batch dimension."""
+
+    def handle_conversion(self, tensor: torch.Tensor, *args) -> torch.Tensor:
+        """Convert an attention pattern tensor to the standard shape [n_heads, pos, pos].
+
+        Args:
+            tensor: Input tensor with shape [batch, n_heads, pos, pos] or [n_heads, pos, pos]
+            *args: Additional context arguments (ignored)
+
+        Returns:
+            Tensor with shape [n_heads, pos, pos]
+        """
+        if tensor.dim() == 4:
+            # Remove batch dimension if present
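+            # Note: squeeze(0) only drops the batch dimension when it has size 1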
+            return tensor.squeeze(0)
+        elif tensor.dim() == 3:
+            # Already in correct shape
+            return tensor
+        else:
+            raise ValueError(f"Unexpected tensor shape for attention pattern: {tensor.shape}")
+
+
class AttentionBridge(GeneralizedComponent):
    """Bridge component for attention layers.

-    This component wraps attention layers from different architectures and provides
-    a standardized interface for hook registration and execution.
+    This component handles the conversion between Hugging Face attention layers
+    and TransformerLens attention components.
    """

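+    # Legacy TransformerLens hook names mapped onto this bridge's hook points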
    hook_aliases = {
-        "hook_result": "hook_hidden_states",
-        "hook_attn_scores": "o.hook_in",
+        "hook_result": "hook_out",
        "hook_q": "q.hook_out",
        "hook_k": "k.hook_out",
        "hook_v": "v.hook_out",
-        "hook_z": "o.hook_out",
+        "hook_z": "hook_hidden_states",
    }

    property_aliases = {
@@ -65,7 +85,7 @@ def __init__(
            submodules: Dictionary of submodules to register (e.g., q_proj, k_proj, etc.)
            conversion_rule: Optional conversion rule. If None, AttentionAutoConversion will be used
            pattern_conversion_rule: Optional conversion rule for attention patterns. If None,
-                uses default RearrangeHookConversion to reshape to (batch, n_heads, pos, pos)
+                uses AttentionPatternConversion to ensure [n_heads, pos, pos] shape
        """
        # Set up conversion rule - use AttentionAutoConversion if None
        if conversion_rule is None:
@@ -74,8 +94,11 @@ def __init__(
        super().__init__(
            name, config=config, submodules=submodules or {}, conversion_rule=conversion_rule
        )
-        self.hook_hidden_states = HookPoint()
+
+        # Create only the hook points that are actually used for attention processing
+        self.hook_attn_scores = HookPoint()
        self.hook_pattern = HookPoint()
+        self.hook_hidden_states = HookPoint()
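+        # hook_attn_scores fires on the raw scores, hook_pattern on the softmaxed
+        # pattern, and hook_hidden_states on the attention output before hook_out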

        # Apply conversion rule to attention-specific hooks
        self.hook_hidden_states.hook_conversion = conversion_rule
@@ -84,29 +107,196 @@ def __init__(
        if pattern_conversion_rule is not None:
            pattern_conversion = pattern_conversion_rule
        else:
-            # Create default conversion rule for attention patterns - reshape to (batch, n_heads, pos, pos)
-            # This assumes the input is (batch, n_heads, seq_len, seq_len) or similar
-            pattern_conversion = RearrangeHookConversion(
-                "batch n_heads pos_q pos_k -> batch n_heads pos_q pos_k"
-            )
+            # Use custom conversion rule that always removes the batch dimension
+            pattern_conversion = AttentionPatternConversion()

        self.hook_pattern.hook_conversion = pattern_conversion

+        # Store intermediate values for pattern creation
+        self._attn_scores = None
+        self._pattern = None
+
    def _process_output(self, output: Any) -> Any:
        """Process the output from the original component.

+        This method intercepts the output to create attention patterns
+        the same way as the old implementation.
+
        Args:
            output: Raw output from the original component

        Returns:
            Processed output with hooks applied
        """
+        # Extract attention scores from the output
+        attn_scores = self._extract_attention_scores(output)
+
+        if attn_scores is not None:
+            # Create attention pattern the same way as the old implementation
+            attn_scores = self.hook_attn_scores(attn_scores)
+            pattern = F.softmax(attn_scores, dim=-1)
+            if not isinstance(pattern, torch.Tensor):
+                raise TypeError(f"Expected 'pattern' to be a Tensor, got {type(pattern)}")
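+            # NaNs appear when a row of scores is fully masked (all -inf), so zero them out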
+            pattern = torch.where(torch.isnan(pattern), torch.zeros_like(pattern), pattern)
+            pattern = self.hook_pattern(pattern)  # [batch, head_index, query_pos, key_pos]
+
+            # Store the pattern for potential use in result calculation
+            self._pattern = pattern
+
+            # Apply the pattern to the output if needed
+            output = self._apply_pattern_to_output(output, pattern)
+
+        return output
+
+    def _extract_attention_scores(self, output: Any) -> Optional[torch.Tensor]:
+        """Extract attention scores from the output.
+
+        Args:
+            output: Output from the original component
+
+        Returns:
+            Attention scores tensor or None if not found
+        """
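+        # Heuristic: scan tuple outputs for a 4D tensor, or look up common
+        # Hugging Face keys in dict outputs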
        if isinstance(output, tuple):
-            return self._process_tuple_output(output)
+            # Look for attention scores in tuple output
+            for element in output:
+                if isinstance(element, torch.Tensor) and element.dim() == 4:
+                    # Assume 4D tensor is attention scores [batch, heads, query_pos, key_pos]
+                    return element
        elif isinstance(output, dict):
-            return self._process_dict_output(output)
+            # Look for attention scores in dict output
+            for key in ["attentions", "attention_weights", "attention_scores"]:
+                if key in output and isinstance(output[key], torch.Tensor):
+                    return output[key]
+
+        return None
+
+    def _apply_pattern_to_output(self, output: Any, pattern: torch.Tensor) -> Any:
+        """Apply the attention pattern to the output.
+
+        This method simulates how the old implementation uses the pattern
+        to calculate the final output.
+
+        Args:
+            output: Original output from the component
+            pattern: Attention pattern tensor
+
+        Returns:
+            Modified output with pattern applied
+        """
+        if isinstance(output, tuple):
+            return self._apply_pattern_to_tuple_output(output, pattern)
+        elif isinstance(output, dict):
+            return self._apply_pattern_to_dict_output(output, pattern)
        else:
-            return self._process_single_output(output)
+            return self._apply_pattern_to_single_output(output, pattern)
+
+    def _apply_pattern_to_tuple_output(
+        self, output: Tuple[Any, ...], pattern: torch.Tensor
+    ) -> Tuple[Any, ...]:
+        """Apply pattern to tuple output.
+
+        Args:
+            output: Tuple output from attention
+            pattern: Attention pattern tensor
+
+        Returns:
+            Processed tuple with pattern applied
+        """
+        processed_output = []
+
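+        # Assumes the usual Hugging Face ordering: hidden states first, with
+        # attention weights possibly in slot 1 or 2, which get replaced by our pattern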
+        for i, element in enumerate(output):
+            if i == 0:  # First element is typically hidden states
+                if element is not None:
+                    element = self._apply_hook_preserving_structure(
+                        element, self.hook_hidden_states
+                    )
+                    # Apply the pattern to the hidden states
+                    element = self._apply_pattern_to_hidden_states(element, pattern)
+            elif i == 1 or i == 2:  # Attention weights indices
+                if isinstance(element, torch.Tensor):
+                    # Replace with our computed pattern
+                    element = pattern
+            processed_output.append(element)
+
+        # Apply the main hook_out to the first element (hidden states) if it exists
+        if len(processed_output) > 0 and processed_output[0] is not None:
+            processed_output[0] = self._apply_hook_preserving_structure(
+                processed_output[0], self.hook_out
+            )
+
+        return tuple(processed_output)
+
+    def _apply_pattern_to_dict_output(
+        self, output: Dict[str, Any], pattern: torch.Tensor
+    ) -> Dict[str, Any]:
+        """Apply pattern to dictionary output.
+
+        Args:
+            output: Dictionary output from attention
+            pattern: Attention pattern tensor
+
+        Returns:
+            Processed dictionary with pattern applied
+        """
+        processed_output = {}
+
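+        # Only hidden-state and attention-weight entries are rewritten; all other
+        # keys pass through unchanged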
+        for key, value in output.items():
+            if key in ["last_hidden_state", "hidden_states"] and value is not None:
+                value = self._apply_hook_preserving_structure(value, self.hook_hidden_states)
+                # Apply the pattern to the hidden states
+                value = self._apply_pattern_to_hidden_states(value, pattern)
+            elif key in ["attentions", "attention_weights"] and value is not None:
+                # Replace with our computed pattern
+                value = pattern
+            processed_output[key] = value
+
+        # Apply hook_hidden_states and hook_out to the main output (usually hidden_states)
+        main_key = next((k for k in output.keys() if "hidden" in k.lower()), None)
+        if main_key and main_key in processed_output:
+            processed_output[main_key] = self._apply_hook_preserving_structure(
+                processed_output[main_key], self.hook_out
+            )
+
+        return processed_output
+
+    def _apply_pattern_to_single_output(
+        self, output: torch.Tensor, pattern: torch.Tensor
+    ) -> torch.Tensor:
+        """Apply pattern to single tensor output.
+
+        Args:
+            output: Single tensor output from attention
+            pattern: Attention pattern tensor
+
+        Returns:
+            Processed tensor with pattern applied
+        """
+        # Apply hooks for single tensor output
+        output = self._apply_hook_preserving_structure(output, self.hook_hidden_states)
+        # Apply the pattern to the output
+        output = self._apply_pattern_to_hidden_states(output, pattern)
+        output = self._apply_hook_preserving_structure(output, self.hook_out)
+        return output
+
+    def _apply_pattern_to_hidden_states(
+        self, hidden_states: torch.Tensor, pattern: torch.Tensor
+    ) -> torch.Tensor:
+        """Apply attention pattern to hidden states.
+
+        This simulates the old implementation's calculate_z_scores method.
+
+        Args:
+            hidden_states: Hidden states tensor
+            pattern: Attention pattern tensor
+
+        Returns:
+            Modified hidden states with pattern applied
+        """
+        # This is a simplified version - in the real implementation,
+        # we would need to extract V from the original component and apply
+        # the pattern properly. For now, we just apply the pattern as a hook.
+        return self.hook_hidden_states(hidden_states)

    def _process_tuple_output(self, output: Tuple[Any, ...]) -> Tuple[Any, ...]:
        """Process tuple output from attention layer.
@@ -202,8 +392,8 @@ def _apply_hook_preserving_structure(self, element: Any, hook_fn) -> Any:
            if isinstance(element[0], torch.Tensor):
                processed_elements[0] = hook_fn(element[0])
                return tuple(processed_elements)
-        # For other types, return as-is
-        return element
+            else:
+                return element

    def forward(self, *args: Any, **kwargs: Any) -> Any:
        """Forward pass through the attention layer.
@@ -237,8 +427,6 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
        # Process output
        output = self._process_output(output)

-        # Update hook outputs for debugging/inspection
-
        return output

    def get_attention_weights(self) -> Optional[torch.Tensor]: