
Commit 12be67c

Author: Griffin Adams
Update FastGen to use new attention loss calculation.
1 parent 008175b

File tree

4 files changed: +21 additions, -31 deletions
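In short, this commit changes the FastGen profiling loss in cache.py to score each candidate compression strategy by the attention probability mass it retains (summed over keys, averaged over query positions) instead of the previous epsilon-based recovery comparison over re-softmaxed masked attention. It also allows recent_window to be given as a float, resolving values at or below 1 as a per-layer fraction of max_cache_length (generation_utils.py, model.py), and sets recent_window: 0.3 in cache_configs/fastgen.yaml.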

cache.py

Lines changed: 5 additions & 29 deletions
@@ -57,7 +57,7 @@ def add_cache_arguments(parser: argparse.ArgumentParser):
     group.add_argument(
         "--recent_window",  # NB: for KVCacheWindow, recent_window is implicitly set to self.max_cache_length - self.global_tokens.
         default=10,  # 10 is default specified in ScissorHands paper ("r" in Algorithm 2).
-        type=int,
+        type=float,  # If < 1, it is a fraction of max_cache_length.
         help="The number of recently generated tokens to always spare from eviction.",
     )

@@ -848,14 +848,10 @@ def __init__(
         kv_mask_shape = (max_batch_size, n_heads, 1, self.max_cache_length)
         self.register_buffer("mask", torch.zeros(kv_mask_shape, dtype=torch.bool))

-        self.epsilon = (
-            1e-4  # Max difference between attention probs to be considered equivalent.
-        )
-
         # NB: Kwargs are sdpa attention kwargs, not the kwargs for the "func"
         self.prefill_attn_callback = {
             "func": self.profile_and_update,
-            "kwargs": {"return_attn_logits": True},
+            "kwargs": {"return_attn_logits": False},
         }

     def return_attn(self):
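With the new scoring below, profiling consumes attention probabilities directly, so the callback now requests return_attn_logits=False and the epsilon tolerance previously used to compare re-normalized attention entries is removed.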
@@ -1006,25 +1002,6 @@ def build_punc_ids_mask(self, input_ids):
         punc_ids_mask = torch.isin(input_ids, self.punc_ids)
         return punc_ids_mask

-    def compute_remasked_attn(self, attn, masks):
-        """
-        Compute the attention with the masks applied. Mask should be true for tokens we want to keep.
-        """
-        num_masks = masks.shape[0]
-        attn = attn.expand(num_masks, -1, -1, -1)
-        return torch.softmax(attn.masked_fill(~masks, float("-inf")), dim=-1)
-
-    def recovery_percent(self, attn, compressed_attn):
-        assert (
-            attn.shape[-2] == attn.shape[-1]
-        ), "Attention matrix expected to be square for profiling."
-        num_causal = attn.shape[-1] * (attn.shape[-1] + 1) // 2
-        num_padding = num_causal - attn.shape[-1]  # Subtract the trace
-        return (
-            (torch.abs(attn - compressed_attn) < self.epsilon).sum(dim=-1).sum(dim=-1)
-            - num_padding
-        ) / num_causal
-
     def profile_attn_heads(self, input_pos, input_ids, attn):
         input_ids = input_ids.squeeze(0)
         seq_len = input_ids.shape[-1]
@@ -1078,10 +1055,9 @@ def profile_attn_heads(self, input_pos, input_ids, attn):
             ]
         )

-        compressed_attns = self.compute_remasked_attn(attn, masks)
-        compressed_scores = self.recovery_percent(
-            compressed_attns, compressed_attns[-1]
-        )
+        attn_rep = attn.expand(masks.shape[0], -1, -1, -1)
+
+        compressed_scores = attn_rep.masked_fill(~masks, 0).sum(dim=-1).mean(dim=-1)

         # For each column, return the first row which has cost >= min_recovery_frac
         cache_strategies = (
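The replacement loss skips the re-softmax entirely: the prefill attention probabilities are broadcast across the candidate masks, entries a strategy would evict are zeroed out, and the retained mass is summed over keys and averaged over query positions, yielding one recovery score per (strategy, head). Below is a minimal, self-contained sketch of that calculation on toy tensors; the shapes (attn as [1, n_heads, seq_len, seq_len], masks as [num_strategies, n_heads, seq_len, seq_len] booleans that are True for kept keys) and the recency-based toy masks are assumptions for illustration, not code taken from the repo.

```python
import torch

# Toy sizes for illustration only.
n_heads, seq_len = 2, 6

# Causal attention probabilities: each row sums to 1 over the allowed keys.
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
scores = torch.rand(1, n_heads, seq_len, seq_len)
attn = torch.softmax(scores.masked_fill(~causal, float("-inf")), dim=-1)


def keep_recent(k: int) -> torch.Tensor:
    # Keep only the k most recent keys for every query (plus the causal constraint).
    idx = torch.arange(seq_len)
    return causal & (idx[None, :] > idx[:, None] - k)


# Hypothetical candidate strategies: recent-2, recent-4, and keep-everything.
masks = torch.stack([keep_recent(k).expand(n_heads, -1, -1) for k in (2, 4, seq_len)])

# New attention loss from this commit: fraction of attention mass each strategy
# retains, summed over keys and averaged over queries -> [num_strategies, n_heads].
attn_rep = attn.expand(masks.shape[0], -1, -1, -1)
compressed_scores = attn_rep.masked_fill(~masks, 0).sum(dim=-1).mean(dim=-1)

# Each head is then assigned the most aggressive strategy whose score clears
# min_recovery_frac (0.85 in cache_configs/fastgen.yaml).
print(compressed_scores)
print(compressed_scores >= 0.85)
```

Because every causal row of attn sums to 1, the keep-everything mask scores exactly 1.0 for each head, so min_recovery_frac reads directly as a fraction of retained attention mass, consistent with the comment on min_recovery_frac in cache_configs/fastgen.yaml.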

cache_configs/fastgen.yaml

Lines changed: 2 additions & 1 deletion
@@ -6,4 +6,5 @@ history_window_size: 400 # How many past steps to consider for attention import
 drop_amount: 0 # How frequently to calculate which tokens to evict (0 means we recalculate every step)
 attn_thresholding: False # Whether to threshold attention scores or record raw probabilities
 min_recovery_frac: 0.85 # Higher is less compression (0.85 means we choose the policy which compresses the most tokens AND recovers 85% of the full attention matrix)
-heavy_hitter_frac: 0.3 # Higher is less compression for the heavy hitter strategy
+heavy_hitter_frac: 0.3 # Higher is less compression for the heavy hitter strategy
+recent_window: 0.3
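Since the value is at or below 1, it is read as a fraction rather than an absolute count: for a hypothetical layer with a max_cache_length of 512, recent_window: 0.3 would spare int(0.3 * 512) = 153 of the most recently generated tokens from eviction (see the resolution logic added to generation_utils.py below).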

generation_utils.py

Lines changed: 13 additions & 0 deletions
@@ -200,6 +200,7 @@ def setup_caches(
                 cache_kwargs["max_cache_length"],
             )
         )
+
     assert (
         model.config.n_layer % len(cache_kwargs["max_cache_length"]) == 0
    ), f'max_cache_length ({len(cache_kwargs["max_cache_length"])}) must be a factor of {model.config.n_layer} layers.'
@@ -209,6 +210,18 @@ def setup_caches(
             item for item in cache_kwargs["max_cache_length"] for _ in range(tile_size)
         ]

+    if type(cache_kwargs["recent_window"]) != list:
+        if cache_kwargs["recent_window"] <= 1:
+            cache_kwargs["recent_window"] = [
+                max(1, int(cache_kwargs["recent_window"] * l))
+                for l in cache_kwargs["max_cache_length"]
+            ]
+        else:
+            cache_kwargs["recent_window"] = [
+                max(1, min(cache_kwargs["recent_window"], l))
+                for l in cache_kwargs["max_cache_length"]
+            ]
+
     # Gets called twice when model is wrapped in torch.compile which causes an error without the if statement
     if type(cache_kwargs["drop_amount"]) != list:
         cache_kwargs["drop_amount"] = [
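The new block normalizes recent_window into a per-layer list: a scalar at or below 1 is treated as a fraction of each layer's max_cache_length, while a larger scalar is used as an absolute token count clamped to the cache length. A small standalone restatement of that branch with made-up cache lengths (the helper name resolve_recent_window is hypothetical, not from the repo):

```python
def resolve_recent_window(recent_window, max_cache_length):
    # Mirrors the branch added to setup_caches above (illustrative only).
    if isinstance(recent_window, list):
        return recent_window
    if recent_window <= 1:
        # Fractional spec: scale by each layer's cache length, keeping at least 1 token.
        return [max(1, int(recent_window * l)) for l in max_cache_length]
    # Absolute spec: clamp to each layer's cache length.
    return [max(1, min(recent_window, l)) for l in max_cache_length]


print(resolve_recent_window(0.3, [2048, 512]))  # -> [614, 153]
print(resolve_recent_window(10, [2048, 4]))     # -> [10, 4]
```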

model.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ def setup_caches(self, **kwargs):
                 cache_strategy=cache_strategy
             )
             # Only pass in the kwargs we need for the cache we chose (useful especially for debugging)
-            layerwise_keys = {"max_cache_length", "drop_amount"}
+            layerwise_keys = {"max_cache_length", "drop_amount", "recent_window"}
             layer_kwargs = {
                 k: kwargs[k][layer_idx] if k in layerwise_keys else kwargs[k]
                 for k in relevant_kwargs
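Adding recent_window to layerwise_keys means the per-layer list built in setup_caches is indexed by layer_idx here, so each layer's cache receives a single resolved window size rather than the whole list.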
