Commit 3a6f596

Improve hook compatibility
1 parent 7e5a5d9 commit 3a6f596

File tree

2 files changed: +20 -51 lines changed

transformer_lens/model_bridge/generalized_components/attention.py

Lines changed: 19 additions & 50 deletions
```diff
@@ -6,7 +6,6 @@
 from typing import Any, Dict, Optional, Tuple
 
 import torch
-import torch.nn.functional as F
 
 from transformer_lens.conversion_utils.conversion_steps.attention_auto_conversion import (
     AttentionAutoConversion,
```
```diff
@@ -20,29 +19,6 @@
 )
 
 
-class AttentionPatternConversion(BaseHookConversion):
-    """Custom conversion rule for attention patterns that always removes batch dimension."""
-
-    def handle_conversion(self, tensor: torch.Tensor, *args) -> torch.Tensor:
-        """Convert attention pattern tensor to standard shape [n_heads, pos, pos].
-
-        Args:
-            tensor: Input tensor with shape [batch, n_heads, pos, pos] or [n_heads, pos, pos]
-            *args: Additional context arguments (ignored)
-
-        Returns:
-            Tensor with shape [n_heads, pos, pos]
-        """
-        if tensor.dim() == 4:
-            # Remove batch dimension if present
-            return tensor.squeeze(0)
-        elif tensor.dim() == 3:
-            # Already in correct shape
-            return tensor
-        else:
-            raise ValueError(f"Unexpected tensor shape for attention pattern: {tensor.shape}")
-
-
 class AttentionBridge(GeneralizedComponent):
     """Bridge component for attention layers.
```
```diff
@@ -55,7 +31,7 @@ class AttentionBridge(GeneralizedComponent):
         "hook_q": "q.hook_out",
         "hook_k": "k.hook_out",
         "hook_v": "v.hook_out",
-        "hook_z": "hook_hidden_states",
+        "hook_z": "o.hook_in",
     }
 
     property_aliases = {
```
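The `hook_z` alias now resolves to the input of the output projection `o` instead of `hook_hidden_states`. Below is a minimal, self-contained sketch (not the bridge implementation; the shapes, names, and einsum convention are illustrative assumptions) of why the tensor entering the output projection is the per-head weighted sum of values, i.e. the `z` that TransformerLens conventionally exposes as `hook_z`:

```python
import torch

# Illustrative dimensions only.
batch, n_heads, pos, d_head, d_model = 2, 4, 8, 16, 64

pattern = torch.softmax(torch.randn(batch, n_heads, pos, pos), dim=-1)  # [batch, head, q_pos, k_pos]
v = torch.randn(batch, n_heads, pos, d_head)                            # [batch, head, k_pos, d_head]
W_O = torch.randn(n_heads * d_head, d_model)                            # output projection weight

# z: per-head weighted sum of values. This is what a hook on the input of the
# output projection ("o.hook_in") would observe.
z = torch.einsum("bhqk,bhkd->bhqd", pattern, v)

# The attention output is just z flattened over heads and projected through W_O.
attn_out = z.permute(0, 2, 1, 3).reshape(batch, pos, n_heads * d_head) @ W_O
print(attn_out.shape)  # torch.Size([2, 8, 64])
```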
```diff
@@ -103,14 +79,9 @@ def __init__(
         # Apply conversion rule to attention-specific hooks
         self.hook_hidden_states.hook_conversion = conversion_rule
 
-        # Set up pattern conversion rule - use provided rule or create default
+        # Set up pattern conversion rule if provided
         if pattern_conversion_rule is not None:
-            pattern_conversion = pattern_conversion_rule
-        else:
-            # Use custom conversion rule that always removes batch dimension
-            pattern_conversion = AttentionPatternConversion()
-
-        self.hook_pattern.hook_conversion = pattern_conversion
+            self.hook_pattern.hook_conversion = pattern_conversion_rule
 
         # Store intermediate values for pattern creation
         self._attn_scores = None
```
```diff
@@ -129,24 +100,22 @@ def _process_output(self, output: Any) -> Any:
             Processed output with hooks applied
         """
         # Extract attention scores from the output
-        attn_scores = self._extract_attention_scores(output)
+        attn_pattern = self._extract_attention_pattern(output)
+
+        if attn_pattern is not None:
+            if not isinstance(attn_pattern, torch.Tensor):
+                raise TypeError(f"Expected 'pattern' to be a Tensor, got {type(attn_pattern)}")
 
-        if attn_scores is not None:
             # Create attention pattern the same way as old implementation
-            attn_scores = self.hook_attn_scores(attn_scores)
-            pattern = F.softmax(attn_scores, dim=-1)
-            if not isinstance(pattern, torch.Tensor):
-                raise TypeError(f"Expected 'pattern' to be a Tensor, got {type(pattern)}")
-            pattern = torch.where(torch.isnan(pattern), torch.zeros_like(pattern), pattern)
-            pattern = self.hook_pattern(pattern)  # [batch, head_index, query_pos, key_pos]
+            attn_pattern = self.hook_pattern(attn_pattern)
 
             # Store the pattern for potential use in result calculation
-            self._pattern = pattern
+            self._pattern = attn_pattern
 
             # Apply the pattern to the output if needed
-            output = self._apply_pattern_to_output(output, pattern)
+            output = self._apply_pattern_to_output(output, attn_pattern)
         else:
-            # If no attention scores found, still apply hooks to the output
+            # If no attention pattern found, still apply hooks to the output
             if isinstance(output, tuple):
                 output = self._process_tuple_output(output)
             elif isinstance(output, dict):
```
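With this change the bridge treats the extracted tensor as an already-normalized attention pattern, so the explicit `F.softmax` and NaN handling are dropped (Hugging Face attention modules typically return post-softmax probabilities when attention outputs are requested). A throwaway sketch, with made-up numbers, of why re-applying softmax to an existing pattern would be wrong:

```python
import torch
import torch.nn.functional as F

scores = torch.tensor([[4.0, 1.0, 0.0]])  # raw attention scores for one query position
pattern = F.softmax(scores, dim=-1)       # post-softmax pattern, sharply peaked: ~[0.94, 0.05, 0.02]

# Normalizing again flattens the distribution toward uniform (~[0.55, 0.23, 0.22]),
# so a tensor that is already a pattern must not be softmaxed a second time.
double = F.softmax(pattern, dim=-1)
print(pattern, double)
```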
```diff
@@ -159,24 +128,24 @@ def _process_output(self, output: Any) -> Any:
 
         return output
 
-    def _extract_attention_scores(self, output: Any) -> Optional[torch.Tensor]:
-        """Extract attention scores from the output.
+    def _extract_attention_pattern(self, output: Any) -> Optional[torch.Tensor]:
+        """Extract attention pattern from the output.
 
         Args:
             output: Output from the original component
 
         Returns:
-            Attention scores tensor or None if not found
+            Attention pattern tensor or None if not found
         """
         if isinstance(output, tuple):
-            # Look for attention scores in tuple output
+            # Look for attention pattern in tuple output
             for element in output:
                 if isinstance(element, torch.Tensor) and element.dim() == 4:
-                    # Assume 4D tensor is attention scores [batch, heads, query_pos, key_pos]
+                    # Assume 4D tensor is attention pattern [batch, heads, query_pos, key_pos]
                     return element
         elif isinstance(output, dict):
-            # Look for attention scores in dict output
-            for key in ["attentions", "attention_weights", "attention_scores"]:
+            # Look for attention pattern in dict output
+            for key in ["attentions", "attention_weights", "attention_scores", "attn_weights"]:
                 if key in output and isinstance(output[key], torch.Tensor):
                     return output[key]
```
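The dict branch now also recognizes an `attn_weights` key. A hypothetical standalone illustration of that lookup (the helper name and mock output are invented for this example; only the key list mirrors the diff):

```python
from typing import Optional

import torch


def extract_pattern_from_dict(output: dict) -> Optional[torch.Tensor]:
    """Mirror of the dict branch of _extract_attention_pattern (illustrative only)."""
    for key in ["attentions", "attention_weights", "attention_scores", "attn_weights"]:
        if key in output and isinstance(output[key], torch.Tensor):
            return output[key]
    return None


# Mock output in the style of a wrapped attention module that exposes "attn_weights".
mock_output = {
    "hidden_states": torch.randn(1, 8, 64),
    "attn_weights": torch.rand(1, 4, 8, 8),  # [batch, heads, query_pos, key_pos]
}
print(extract_pattern_from_dict(mock_output).shape)  # torch.Size([1, 4, 8, 8])
```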

transformer_lens/model_bridge/generalized_components/block.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -23,7 +23,7 @@ class BlockBridge(GeneralizedComponent):
 
     hook_aliases = {
         "hook_resid_pre": "hook_in",
-        "hook_resid_mid": "attn.hook_out",
+        "hook_resid_mid": "ln2.hook_in",
         "hook_resid_post": "hook_out",
         "hook_attn_in": "attn.hook_in",
         "hook_attn_out": "attn.hook_out",
```
