
Commit 9c9845b

Author: Griffin Adams (committed)

Changes to FastGen.

1 parent 12a6435 · commit 9c9845b


5 files changed: +40 −11 lines


cache.py

Lines changed: 10 additions & 5 deletions
@@ -214,7 +214,7 @@ def compression_ratio(self, seq_len):
         """
         # Final token isn't passed to cache so must -1 from seq_len
         n = seq_len - 1
-        return ((n - min(self.cache_cts, self.max_cache_length)) / n).mean()
+        return ((n - torch.clamp_max(self.cache_cts, self.max_cache_length)) / n).mean()

     def return_kv_cache(self):
         # Truncate the cache based on number of insertions. It will be at the end since we prefill in-order.
@@ -651,10 +651,11 @@ def reset(self):
         self.attn_history_num.zero_()
         self.attn_history_denom.zero_()
         self.attn_counter.zero_()
-        self.eviction_queue.zero_()
-        # Start with an "empty queue" so that we can fill it up
-        self.eviction_idx.fill_(self.drop_amount)
-        assert self.queue_len() == 0
+        if hasattr(self, "eviction_queue"):
+            self.eviction_queue.zero_()
+            # Start with an "empty queue" so that we can fill it up
+            self.eviction_idx.fill_(self.drop_amount)
+            assert self.queue_len() == 0

     def queue_len(self):
         return self.drop_amount - self.eviction_idx
@@ -778,6 +779,7 @@ def __init__(
         **kwargs,
     ):
         self.global_tokens = 0  # No global tokens for FastGen
+        self.attn_record_freq = 1  # We record attention every step for FastGen
         super().__init__(
             max_batch_size,
             n_heads,
@@ -1072,6 +1074,9 @@ def profile_and_update(self, input_pos, input_ids, k_val, v_val, attn):
             self.profile_attn_heads(input_pos, input_ids, attn)
         )

+        # Show which strategies are selected
+        print([self.strategies[i] for i in self.cache_strategies.tolist()])
+
         # If none of the heads selected a heavy hitter strategy, we don't need to track attention weights
         self.requires_heavy_check = any(
             ["heavy" in KVCacheFastGen.strategies[i] for i in self.cache_strategies]

cache_configs/fastgen.yaml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+cache_strategy: "fastgen"
+max_cache_length: [1.0]  # [Fixed] Control compression ratio with min_recovery_frac
+prompt_compression_strategy: "snapkv"  # Won't be used. Fastgen profiles attn and inserts directly.
+recent_window: 10  # Local window to consider for local strategies
+history_window_size: 400  # How many past steps to consider for attention importance calculation
+drop_amount: 0  # How frequently to calculate which tokens to evict (0 means we recalculate every step)
+attn_thresholding: False  # Whether to threshold attention scores or record raw probabilities
+min_recovery_frac: 0.85  # Higher is less compression (0.85 means we choose the policy which compresses the most tokens AND recovers 85% of the full attention matrix)
+heavy_hitter_frac: 0.3  # Higher is less compression for the heavy hitter strategy
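
The min_recovery_frac comment describes the selection rule: prefer the policy that keeps the fewest tokens while still recovering at least that fraction of the attention mass. A rough sketch of that rule in isolation (the pick_policy helper, policy names, and fractions are invented for illustration and are not the repository's implementation):

def pick_policy(candidates, min_recovery_frac=0.85):
    """Each candidate is (name, kept_frac, recovered_attn_frac)."""
    admissible = [c for c in candidates if c[2] >= min_recovery_frac]
    if not admissible:
        return ("full", 1.0, 1.0)  # fall back to keeping every token
    return min(admissible, key=lambda c: c[1])  # fewest kept tokens wins

print(pick_policy([
    ("special", 0.02, 0.40),
    ("special+local", 0.10, 0.80),
    ("special+local+heavy", 0.35, 0.90),
]))  # -> ('special+local+heavy', 0.35, 0.9)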

eval_multi.py

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@

 HPARAMS = {
     "max_cache_length": [[8192], [4096], [2048], [1024], [512], [256], [128]],
+    "min_recovery_frac": [0.5, 0.6, 0.7, 0.8, 0.9, 0.95],
 }

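The added min_recovery_frac list extends the sweep so that each cache-length setting is paired with every recovery threshold. A small sketch of how such a grid expands, assuming the harness takes the Cartesian product of the HPARAMS values (the actual expansion logic in eval_multi.py may differ):

import itertools

HPARAMS = {
    "max_cache_length": [[8192], [4096], [2048], [1024], [512], [256], [128]],
    "min_recovery_frac": [0.5, 0.6, 0.7, 0.8, 0.9, 0.95],
}

configs = [dict(zip(HPARAMS, combo)) for combo in itertools.product(*HPARAMS.values())]
print(len(configs))  # 7 * 6 = 42 runs
print(configs[0])    # {'max_cache_length': [8192], 'min_recovery_frac': 0.5}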

model.py

Lines changed: 7 additions & 5 deletions
@@ -205,12 +205,14 @@ def reset_caches(self):
     def get_cache_stats(self, prompt_len, gen_len):
         stats = {}
         final_seq_len = prompt_len + gen_len
+        crs = []
         for layer_idx, layer in enumerate(self.layers):
-            stats[f"compression_ratio_{layer_idx}"] = (
-                layer.attention.kv_cache.compression_ratio(
-                    seq_len=torch.tensor(final_seq_len)
-                ).item()
-            )
+            cr = layer.attention.kv_cache.compression_ratio(
+                seq_len=torch.tensor(final_seq_len)
+            ).item()
+            stats[f"compression_ratio_{layer_idx}"] = cr
+            crs.append(cr)
+        stats["compression_ratio_avg"] = sum(crs) / len(crs)
         return stats

     def min_cache_length(self):
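
With this change each per-layer ratio is kept and their mean is reported under a single key, which gives one summary number per run for logging. A toy illustration of the resulting dictionary (the layer count and ratio values are made up):

crs = [0.55, 0.72]  # hypothetical per-layer compression ratios
stats = {f"compression_ratio_{i}": cr for i, cr in enumerate(crs)}
stats["compression_ratio_avg"] = sum(crs) / len(crs)
print(stats)
# {'compression_ratio_0': 0.55, 'compression_ratio_1': 0.72, 'compression_ratio_avg': 0.635}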

tokenizer.py

Lines changed: 13 additions & 1 deletion
@@ -201,7 +201,19 @@ def __init__(self, model_path):
         ]

     def special_ids(self) -> List[List[int]]:
-        return [[x] for x in self.tokenizer.special_token_ids]
+        if hasattr(self.tokenizer, "special_token_ids"):
+            return [[x] for x in self.tokenizer.special_token_ids]
+
+        # Its likely a tokenizer that has a special_tokens_map attribute
+        special_tokens_ = list(self.tokenizer.special_tokens_map.values())
+        special_tokens = []
+        for t in special_tokens_:
+            if type(t) == list:
+                special_tokens.extend(t)
+            else:
+                special_tokens.append(t)
+        special_tokens = list(set(special_tokens))
+        return [[self.tokenizer.convert_tokens_to_ids(t)] for t in special_tokens]

     def encode(self, text):
         return self.tokenizer.encode(text, add_special_tokens=False)
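
The new branch covers tokenizers without a special_token_ids attribute by flattening special_tokens_map, whose values may be single tokens or lists, and converting the unique tokens to ids. A quick usage sketch assuming a Hugging Face tokenizer (the model name is only an example):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
print(tok.special_tokens_map)  # {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}

tokens = []
for t in tok.special_tokens_map.values():
    if isinstance(t, list):   # e.g. 'additional_special_tokens'
        tokens.extend(t)
    else:
        tokens.append(t)
ids = [[tok.convert_tokens_to_ids(t)] for t in set(tokens)]
print(ids)  # [[50256]] for GPT-2, since all three special tokens map to the same id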
