@@ -68,6 +68,14 @@ def __init__(self, head_specific, **kwargs) -> None:
         self.kernel_size = 5
         self.observation_len = 16
 
+        self.pool = torch.nn.AvgPool1d(
+            self.kernel_size,
+            stride=1,
+            padding=self.kernel_size // 2,
+            ceil_mode=False,
+            count_include_pad=False,
+        )
+
     def is_compatible(self) -> bool:
         # Can only be used with head-specific KV-caches
         return self.head_specific
@@ -78,19 +86,12 @@ def requires_attn(self) -> bool:
     def __call__(self, input_pos, k_val, v_val, attn):
         assert self.head_specific, "SnapKV can only be used with head-specific KV-caches, e.g., placing the same token in different locations across heads)."
 
-        pool = torch.nn.AvgPool1d(
-            self.kernel_size,
-            stride=1,
-            padding=self.kernel_size // 2,
-            ceil_mode=False,
-            count_include_pad=False,
-        )
         priority = attn[:, :, -self.observation_len :, :].mean(dim=2)
         prev_shape = priority.shape
 
         # We'll be returning the attention history so we need to keep a copy before it's modified
         attn_history = priority.clone()
-        priority = pool(priority)
+        priority = self.pool(priority)
 
         assert (
            priority.shape == prev_shape
        ), f"Pooling operation should not change the dimension: {prev_shape} -> {priority.shape}"