
Commit 47dcd57

add attention_sink.py
This PR adds `KVCacheWithAttentionSink`, which is required for `AttentionSink`. It keeps the first `sink_size` tokens as attention sinks and maintains a sliding window of `window_size` for new tokens.

Note: I am trying to implement and verify `AttentionSink` in eager mode first, so the current implementation may still have some errors or performance issues. For example, it does not support the case where dynamic shape is disabled. I will leave these problems to be resolved when we are ready to deploy `AttentionSink` to edge.

Differential Revision: [D65235798](https://our.internmc.facebook.com/intern/diff/D65235798/)

[ghstack-poisoned]
1 parent f2ad9d0 commit 47dcd57
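
For orientation, the cache layout described in the commit message can be sketched as follows (an illustration added here, not part of the commit; the names mirror the fields defined in attention_sink.py below):

# Illustration only: logical layout of the KV cache along the sequence dimension.
#
#   index:  0 .. sink_size-1        | sink_size .. sink_size+window_size-1
#   role:   attention sinks (kept)  | sliding window of the most recent tokens
#
# cache_size = sink_size + window_size. Sink tokens are never evicted; when new
# tokens overflow the cache, the oldest window tokens are shifted out instead.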

File tree

3 files changed: +304 -0 lines changed


examples/models/llama/TARGETS

Lines changed: 12 additions & 0 deletions
@@ -92,6 +92,7 @@ runtime.python_library(
         "source_transformation/sdpa.py",
         "source_transformation/spin_quant.py",
         "source_transformation/vulkan_rope.py",
+        "source_transformation/attention_sink.py",
     ],
     _is_external_target = True,
     base_module = "executorch.examples.models.llama",
@@ -212,3 +213,14 @@ runtime.python_test(
         "//executorch/examples/models/llama:llama_transformer",
     ],
 )
+
+runtime.python_test(
+    name = "attention_sink_test",
+    srcs = [
+        "source_transformation/test_attention_sink.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        ":export_library",
+    ],
+)
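
Assuming a Buck2-based checkout where the //executorch/examples/models/llama cell resolves (as the existing deps above suggest), the new test target could presumably be run with something like:

buck2 test //executorch/examples/models/llama:attention_sink_test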
examples/models/llama/source_transformation/attention_sink.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Components for supporting Attention Sink. See
# https://arxiv.org/abs/2309.17453 for more details about Attention Sink.

from typing import Tuple

import torch

from torch import nn


class KVCacheWithAttentionSink(nn.Module):
    """
    KV cache that supports attention sink. It keeps the initial few tokens as attention sink.
    For other tokens, it uses a sliding window to keep the most recent tokens.

    Parameters:
        window_size: the size of the sliding window
        sink_size: the number of initial tokens to keep as attention sink
    """

    def __init__(
        self,
        max_batch_size: int,
        window_size: int,
        sink_size: int,
        n_heads: int,
        head_dim: int,
        transpose_cache: bool,
        dtype=torch.float32,
    ):
        super().__init__()
        self.window_size = window_size
        self.sink_size = sink_size
        self.cache_size = window_size + sink_size
        self.is_transposed = transpose_cache
        if transpose_cache:
            cache_shape = (max_batch_size, n_heads, self.cache_size, head_dim)
        else:
            cache_shape = (max_batch_size, self.cache_size, n_heads, head_dim)

        self.max_batch_size = max_batch_size
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.transpose_cache = transpose_cache
        self.register_buffer(
            "k_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
        )
        self.register_buffer(
            "v_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
        )

    def update(
        self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        start_pos = input_pos[0].item()
        torch._check_is_size(start_pos)
        dim_to_slice = 2 if self.transpose_cache else 1
        seq_length = k_val.size(dim_to_slice)

        if start_pos + seq_length <= self.cache_size:
            # There are still enough spaces in the cache to store the new tokens.
            # No need to shift the existing tokens.
            # pyre-ignore: Incompatible parameter type [6]
            narrowed_k = self.k_cache.narrow(dim_to_slice, start_pos, seq_length)
            # pyre-ignore: Incompatible parameter type [6]
            narrowed_v = self.v_cache.narrow(dim_to_slice, start_pos, seq_length)

            narrowed_k.copy_(k_val)
            narrowed_v.copy_(v_val)
        else:
            # There are not enough spaces in the cache to store the new tokens.
            # We need to shift the existing tokens.
            num_to_evict = min(start_pos + seq_length - self.cache_size, seq_length)

            # Shift the existing entries to the left
            # pyre-ignore: Incompatible parameter type [6]
            k_to_keep = self.k_cache.narrow(
                dim_to_slice,
                self.sink_size + num_to_evict,
                self.window_size - num_to_evict,
            ).clone()
            # pyre-ignore: Incompatible parameter type [6]
            v_to_keep = self.v_cache.narrow(
                dim_to_slice,
                self.sink_size + num_to_evict,
                self.window_size - num_to_evict,
            ).clone()
            # pyre-ignore: Incompatible parameter type [6]
            k_new_position = self.k_cache.narrow(
                dim_to_slice, self.sink_size, self.window_size - num_to_evict
            )
            # pyre-ignore: Incompatible parameter type [6]
            v_new_position = self.v_cache.narrow(
                dim_to_slice, self.sink_size, self.window_size - num_to_evict
            )
            k_new_position.copy_(k_to_keep)
            v_new_position.copy_(v_to_keep)

            # Appending new entries
            narrowed_k = self.k_cache.narrow(
                dim_to_slice, self.cache_size - seq_length, seq_length
            )
            narrowed_v = self.v_cache.narrow(
                dim_to_slice, self.cache_size - seq_length, seq_length
            )
            narrowed_k.copy_(k_val)
            narrowed_v.copy_(v_val)
        return self.k_cache, self.v_cache
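
As a quick usage sketch of the class above (not part of the diff; the import path comes from the test file below, and the parameter values simply mirror its setUp with transpose_cache=False):

import torch

from executorch.examples.models.llama.source_transformation.attention_sink import (
    KVCacheWithAttentionSink,
)

# Parameters chosen only for illustration, matching the tests below.
cache = KVCacheWithAttentionSink(
    max_batch_size=1,
    window_size=28,
    sink_size=4,
    n_heads=8,
    head_dim=16,
    transpose_cache=False,
)

# Prefill: 32 tokens starting at position 0 exactly fill the cache
# (cache_size = sink_size + window_size = 4 + 28 = 32), so no shifting happens.
k = torch.randn(1, 32, 8, 16)
v = torch.randn(1, 32, 8, 16)
cache.update(torch.tensor([0], dtype=torch.int32), k, v)

# Decoding one more token at position 32 overflows the cache: the first 4 sink
# tokens stay in place, the oldest window token is evicted, the remaining 27
# window tokens shift left by one, and the new token is written at the end.
k_new = torch.randn(1, 1, 8, 16)
v_new = torch.randn(1, 1, 8, 16)
k_out, v_out = cache.update(torch.tensor([32], dtype=torch.int32), k_new, v_new)
assert k_out.shape == (1, 32, 8, 16)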
examples/models/llama/source_transformation/test_attention_sink.py

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch

from executorch.examples.models.llama.source_transformation.attention_sink import (
    KVCacheWithAttentionSink,
)


class KVCacheWithAttentionSinkTest(unittest.TestCase):

    def _init_cache(self):
        self.kv_cache = KVCacheWithAttentionSink(
            max_batch_size=self.max_batch_size,
            window_size=self.window_size,
            sink_size=self.sink_size,
            n_heads=self.n_heads,
            head_dim=self.head_dim,
            transpose_cache=self.transpose_cache,
            dtype=self.dtype,
        )

    def setUp(self):
        torch.manual_seed(42)
        self.max_batch_size = 1
        self.window_size = 28
        self.sink_size = 4
        self.n_heads = 8
        self.head_dim = 16
        self.transpose_cache = False
        self.dtype = torch.float32
        self._init_cache()

    def test_update_empty_cache(self):
        # KV cache is empty, update will fill sink tokens
        input_pos = torch.tensor([0], dtype=torch.int32)
        k = torch.ones((1, 1, 8, 16), dtype=self.dtype)
        v = torch.ones((1, 1, 8, 16), dtype=self.dtype)

        k_out, v_out = self.kv_cache.update(input_pos, k, v)

        expected_k_out = torch.cat(
            [
                torch.ones((1, 1, 8, 16), dtype=self.dtype),
                torch.zeros((1, 31, 8, 16), dtype=self.dtype),
            ],
            dim=1,
        )
        expected_v_out = torch.cat(
            [
                torch.ones((1, 1, 8, 16), dtype=self.dtype),
                torch.zeros((1, 31, 8, 16), dtype=self.dtype),
            ],
            dim=1,
        )

        torch.testing.assert_close(k_out, expected_k_out)
        torch.testing.assert_close(v_out, expected_v_out)

    def test_update_without_shift(self):
        # KV cache has enough spaces for new tokens, no shift
        input_pos = torch.tensor([0], dtype=torch.int32)
        k = torch.ones((1, 5, 8, 16), dtype=self.dtype)
        v = torch.ones((1, 5, 8, 16), dtype=self.dtype)

        self.kv_cache.update(input_pos, k, v)

        input_pos = torch.tensor([5], dtype=torch.int32)
        k = torch.full((1, 5, 8, 16), 2, dtype=self.dtype)
        v = torch.full((1, 5, 8, 16), 2, dtype=self.dtype)

        k_out, v_out = self.kv_cache.update(input_pos, k, v)

        expected_k_out = torch.cat(
            [
                torch.ones((1, 5, 8, 16), dtype=self.dtype),
                torch.full((1, 5, 8, 16), 2, dtype=self.dtype),
                torch.zeros((1, 22, 8, 16), dtype=self.dtype),
            ],
            dim=1,
        )
        expected_v_out = torch.cat(
            [
                torch.ones((1, 5, 8, 16), dtype=self.dtype),
                torch.full((1, 5, 8, 16), 2, dtype=self.dtype),
                torch.zeros((1, 22, 8, 16), dtype=self.dtype),
            ],
            dim=1,
        )

        torch.testing.assert_close(k_out, expected_k_out)
        torch.testing.assert_close(v_out, expected_v_out)

    def test_update_with_some_shift(self):
        # KV cache has some spaces for new tokens but not all, shift some tokens
        input_pos = torch.tensor([0], dtype=torch.int32)
        k = torch.ones((1, 5, 8, 16), dtype=self.dtype)
        v = torch.ones((1, 5, 8, 16), dtype=self.dtype)

        self.kv_cache.update(input_pos, k, v)

        input_pos = torch.tensor([5], dtype=torch.int32)
        k = torch.full((1, 5, 8, 16), 2, dtype=self.dtype)
        v = torch.full((1, 5, 8, 16), 2, dtype=self.dtype)

        self.kv_cache.update(input_pos, k, v)

        input_pos = torch.tensor([10], dtype=torch.int32)
        k = torch.full((1, 24, 8, 16), 3, dtype=self.dtype)
        v = torch.full((1, 24, 8, 16), 3, dtype=self.dtype)

        k_out, v_out = self.kv_cache.update(input_pos, k, v)

        expected_k_out = torch.cat(
            [
                torch.ones((1, 4, 8, 16), dtype=self.dtype),
                torch.full((1, 4, 8, 16), 2, dtype=self.dtype),
                torch.full((1, 24, 8, 16), 3, dtype=self.dtype),
            ],
            dim=1,
        )
        expected_v_out = torch.cat(
            [
                torch.ones((1, 4, 8, 16), dtype=self.dtype),
                torch.full((1, 4, 8, 16), 2, dtype=self.dtype),
                torch.full((1, 24, 8, 16), 3, dtype=self.dtype),
            ],
            dim=1,
        )

        torch.testing.assert_close(k_out, expected_k_out)
        torch.testing.assert_close(v_out, expected_v_out)

    def test_update_with_all_shift(self):
        # KV cache has no spaces for new tokens, shift all tokens
        input_pos = torch.tensor([0], dtype=torch.int32)
        k = torch.ones((1, 5, 8, 16), dtype=self.dtype)
        v = torch.ones((1, 5, 8, 16), dtype=self.dtype)

        self.kv_cache.update(input_pos, k, v)

        input_pos = torch.tensor([5], dtype=torch.int32)
        k = torch.full((1, 28, 8, 16), 2, dtype=self.dtype)
        v = torch.full((1, 28, 8, 16), 2, dtype=self.dtype)

        self.kv_cache.update(input_pos, k, v)

        input_pos = torch.tensor([33], dtype=torch.int32)
        k = torch.full((1, 6, 8, 16), 3, dtype=self.dtype)
        v = torch.full((1, 6, 8, 16), 3, dtype=self.dtype)

        k_out, v_out = self.kv_cache.update(input_pos, k, v)

        expected_k_out = torch.cat(
            [
                torch.ones((1, 4, 8, 16), dtype=self.dtype),
                torch.full((1, 22, 8, 16), 2, dtype=self.dtype),
                torch.full((1, 6, 8, 16), 3, dtype=self.dtype),
            ],
            dim=1,
        )
        expected_v_out = torch.cat(
            [
                torch.ones((1, 4, 8, 16), dtype=self.dtype),
                torch.full((1, 22, 8, 16), 2, dtype=self.dtype),
                torch.full((1, 6, 8, 16), 3, dtype=self.dtype),
            ],
            dim=1,
        )

        torch.testing.assert_close(k_out, expected_k_out)
        torch.testing.assert_close(v_out, expected_v_out)
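
To see why test_update_with_all_shift expects 4 ones, 22 twos, and 6 threes, the eviction arithmetic from KVCacheWithAttentionSink.update can be traced by hand (a reading of the code above, not part of the diff):

# Third update in test_update_with_all_shift: start_pos = 33, 6 new tokens.
cache_size = 4 + 28                                                  # sink_size + window_size = 32
start_pos, seq_length = 33, 6
num_to_evict = min(start_pos + seq_length - cache_size, seq_length)  # = 6
kept = 28 - num_to_evict                                             # 22 value-2 window tokens slide left
# Final layout: 4 sink tokens (value 1) | 22 shifted window tokens (value 2) | 6 new tokens (value 3)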
