
Commit dbbaa85

Update on "add attention_sink.py"
This PR adds `KVCacheWithAttentionSink`, which is required for `AttentionSink`. It keeps the first `sink_size` tokens as attention sinks and maintains a sliding window of `window_size` for new tokens.

Note: I am implementing and verifying `AttentionSink` in eager mode first, so the current implementation may still have some errors or performance issues. For example, it does not support the case where dynamic shape is disabled. These problems will be resolved when we are ready to deploy `AttentionSink` to edge.

Differential Revision: [D65235798](https://our.internmc.facebook.com/intern/diff/D65235798/)

[ghstack-poisoned]
2 parents 5de701d + 67aeda2 commit dbbaa85
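For orientation, here is a minimal sketch of the eviction policy described in the commit message: keep the first `sink_size` tokens as attention sinks plus a sliding window of the most recent `window_size` tokens, and drop everything in between. The function name and tensor layout are illustrative assumptions, not the PR's `KVCacheWithAttentionSink` API, and the RoPE re-rotation of the kept keys (which the diff below handles via `rerotate_k`) is omitted.

```python
import torch


def evict_from_window(k_cache: torch.Tensor, sink_size: int, window_size: int) -> torch.Tensor:
    """Assumes k_cache is laid out as (batch, seq_len, n_heads, head_dim)."""
    seq_len = k_cache.size(1)
    if seq_len <= sink_size + window_size:
        return k_cache  # nothing to evict yet
    sinks = k_cache[:, :sink_size]               # first sink_size tokens stay forever
    recent = k_cache[:, seq_len - window_size:]  # most recent window_size tokens
    # Note: the kept keys would also need RoPE re-rotation, since their
    # positions shift after eviction; that step is omitted in this sketch.
    return torch.cat([sinks, recent], dim=1)
```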

File tree

3 files changed: +244 / -230 lines


examples/models/llama/TARGETS

Lines changed: 2 additions & 0 deletions
```diff
@@ -220,7 +220,9 @@ runtime.python_test(
     srcs = [
         "source_transformation/test_attention_sink.py",
     ],
+    supports_static_listing = False,
     deps = [
+        "fbsource//third-party/pypi/parameterized:parameterized",
         "//caffe2:torch",
         ":export_library",
     ],
```
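The new `parameterized` dependency together with `supports_static_listing = False` suggests the test file generates test cases via `parameterized.expand`, which cannot be listed statically by the build system. A hypothetical sketch of such a test (the class name and case values are made up, not taken from test_attention_sink.py):

```python
import unittest

from parameterized import parameterized


class ExampleAttentionSinkTest(unittest.TestCase):
    # Each tuple becomes its own generated test case at runtime,
    # which is why static test listing is disabled in TARGETS.
    @parameterized.expand([(4, 124), (4, 2044)])  # hypothetical (sink_size, window_size) pairs
    def test_window_is_larger_than_sink(self, sink_size, window_size):
        self.assertGreater(window_size, sink_size)
```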

examples/models/llama/source_transformation/attention_sink.py

Lines changed: 17 additions & 8 deletions
```diff
@@ -100,7 +100,6 @@ def __init__(
         self.sink_size = sink_size
         self.eviction_batch_size = eviction_batch_size
         self.position_shift = 0
-        assert not transpose_cache
 
     def evict_tokens(self, input_pos: torch.Tensor, seq_len: int) -> int:
         """
@@ -134,16 +133,26 @@ def evict_tokens(self, input_pos: torch.Tensor, seq_len: int) -> int:
                 self.sink_size + num_to_evict,  # pyre-ignore [6]
                 num_to_keep,  # pyre-ignore [6]
             )
+            if self.transpose_cache:
+                k_to_keep = self.rope.rerotate_k(
+                    k=k_to_keep.transpose(1, 2),
+                    original_position=(  # pyre-ignore [6]
+                        self.sink_size + num_to_evict
+                    ),
+                    new_position=self.sink_size,
+                ).transpose(1, 2)
+            else:
+                k_to_keep = self.rope.rerotate_k(
+                    k=k_to_keep,
+                    original_position=(  # pyre-ignore [6]
+                        self.sink_size + num_to_evict
+                    ),
+                    new_position=self.sink_size,
+                )
             self.k_cache = torch.cat(
                 [
                     self.k_cache.narrow(dim_to_slice, 0, self.sink_size),
-                    self.rope.rerotate_k(
-                        k=k_to_keep,
-                        original_position=(  # pyre-ignore [6]
-                            self.sink_size + num_to_evict
-                        ),
-                        new_position=self.sink_size,
-                    ),
+                    k_to_keep,
                     torch.zeros_like(
                         self.k_cache.narrow(
                             dim_to_slice, 0, num_empty_space  # pyre-ignore [6]
```