@@ -24,6 +24,37 @@ def is_compatible(self) -> bool:
        pass


+class PromptCompressorRandom(PromptCompressor):
+    def __init__(self, head_specific, **kwargs) -> None:
+        super().__init__(head_specific, **kwargs)
+
+    def is_compatible(self) -> bool:
+        # Can be used with any cache
+        return True
+
+    def requires_attn(self) -> bool:
+        return False
+
+    def __call__(self, input_pos, k_val, v_val):
+        seq_len = input_pos.shape[0]
+        global_idxs = torch.arange(self.global_tokens, device=input_pos.device)
+        rand_idxs = (
+            (
+                self.global_tokens
+                + torch.randperm(seq_len - self.global_tokens, device=input_pos.device)[
+                    : self.max_cache_length - self.global_tokens
+                ]
+            )
+            .sort()
+            .values
+        )
+        keep_idxs = torch.cat([global_idxs, rand_idxs], dim=0)
+        assert len(keep_idxs) == self.max_cache_length
+        k_val = k_val[:, :, keep_idxs]
+        v_val = v_val[:, :, keep_idxs]
+        return keep_idxs, k_val, v_val
+
+
class PromptCompressorRecentGlobal(PromptCompressor):
    def __init__(self, head_specific, **kwargs) -> None:
        super().__init__(head_specific, **kwargs)
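For intuition, here is a minimal standalone sketch of the selection logic this hunk adds, using made-up sizes (`global_tokens`, `max_cache_length`, and `seq_len` are hypothetical values, not defaults from the repo):

```python
import torch

global_tokens = 4       # always-kept prefix ("sink" tokens)
max_cache_length = 16   # total KV budget after compression
seq_len = 64            # prompt length

# Keep the prefix, then fill the remaining budget with a uniform
# random sample of the rest of the prompt, sorted into positional order.
global_idxs = torch.arange(global_tokens)
rand_idxs = (
    global_tokens
    + torch.randperm(seq_len - global_tokens)[: max_cache_length - global_tokens]
).sort().values
keep_idxs = torch.cat([global_idxs, rand_idxs])

assert keep_idxs.shape[0] == max_cache_length  # 4 global + 12 random
```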
@@ -84,9 +115,10 @@ def requires_attn(self) -> bool:
        return True

    def __call__(self, input_pos, k_val, v_val, attn):
-        assert self.head_specific, "SnapKV can only be used with head-specific KV-caches, e.g., placing the same token in different locations across heads)."
+        seq_len = input_pos.shape[0]
+        obs_len = min(self.observation_len, seq_len)

-        priority = attn[:, :, -self.observation_len :, :].mean(dim=2)
+        priority = attn[:, :, -obs_len:, :].mean(dim=2)

        prev_shape = priority.shape

        # We'll be returning the attention history so we need to keep a copy before it's modified
@@ -95,8 +127,9 @@ def __call__(self, input_pos, k_val, v_val, attn):
        assert (
            priority.shape == prev_shape
        ), f"Pooling operation should not change the dimension: {prev_shape} -> {priority.shape}"
-        priority[:, :, -self.observation_len :] = (
-            1.0  # Ensure the observation window is selected
+        priority[:, :, -obs_len:] = 1.0  # Ensure the observation window is selected
+        priority[:, :, : self.global_tokens] = (
+            1.0  # Ensure the global tokens are selected
        )
        keep_idxs = (
            priority.topk(self.max_cache_length, dim=-1).indices.sort(dim=-1).values
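Taken together with the elided pooling lines, the SnapKV path now scores keys by the attention they receive from a clamped observation window, then force-keeps both that window and the global tokens before taking the top-k. A rough standalone sketch, with random numbers standing in for real attention and the pooling step omitted; all shapes here are made up:

```python
import torch

bsz, n_heads, seq_len = 1, 2, 32
observation_len, global_tokens, max_cache_length = 8, 4, 16
attn = torch.rand(bsz, n_heads, seq_len, seq_len).softmax(dim=-1)

# Clamp so prompts shorter than the observation window still work
obs_len = min(observation_len, seq_len)

# Score each key by the mean attention it gets from the observation window
priority = attn[:, :, -obs_len:, :].mean(dim=2)  # (bsz, n_heads, seq_len)

# Force-keep the observation window and the global tokens
priority[:, :, -obs_len:] = 1.0
priority[:, :, : global_tokens] = 1.0

# Top-scoring keys per head, restored to positional order
keep_idxs = priority.topk(max_cache_length, dim=-1).indices.sort(dim=-1).values
print(keep_idxs.shape)  # torch.Size([1, 2, 16])
```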
@@ -152,5 +185,7 @@ def prompt_compressor_constructor(strategy):
        return PromptCompressorSnapKV
    elif strategy == "l2":
        return PromptCompressorL2
+    elif strategy == "random":
+        return PromptCompressorRandom
    else:
        raise ValueError(f"Unknown prompt compression strategy: {strategy}")
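With the registration in place, the new strategy should be constructible like the others. A hypothetical wiring sketch; the exact constructor kwargs (`global_tokens`, `max_cache_length`) aren't visible in this diff and are assumed here:

```python
import torch

compressor_cls = prompt_compressor_constructor("random")
# Assumed kwargs; the base PromptCompressor signature is not shown in this diff
compressor = compressor_cls(head_specific=False, global_tokens=4, max_cache_length=16)

input_pos = torch.arange(64)        # prompt positions
k_val = torch.rand(1, 8, 64, 128)   # (batch, heads, seq_len, head_dim)
v_val = torch.rand(1, 8, 64, 128)

# Unlike SnapKV, no attention scores are needed (requires_attn() is False)
keep_idxs, k_val, v_val = compressor(input_pos, k_val, v_val)
assert k_val.shape[2] == 16
```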