
Commit 51d27f4

implement position encoding for shifted tokens
AttentionSink uses tokens' positions in the KVCache instead of their positions in the actual text, so when tokens get shifted in the KVCache, the position embeddings of q and k need to be updated. The original [implementation](https://github.com/mit-han-lab/streaming-llm) of AttentionSink with Rope caches the original q and k in the KVCache and applies the position embedding during inference.

This PR adds `RopeWithAttentionSink` instead. It assumes that q and k are already encoded with their original positions; when tokens are shifted, we reapply only the position delta. This has two benefits:

- It minimizes our code, since the existing `llama_transformer` already applies the rope embedding before the KVCache update.
- It avoids a performance regression when tokens are not shifted, because we don't need to reapply position encoding in the KVCache for them.

Differential Revision: [D65366440](https://our.internmc.facebook.com/intern/diff/D65366440/)

[ghstack-poisoned]
1 parent 7140dec commit 51d27f4
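To illustrate the intended flow, the sketch below mirrors the new unit tests in this diff; it is not part of the change itself, and the 8-token shift is an arbitrary illustrative number. q and k are rotated once at their original position by the existing `Rope`, and when the cache later shifts those tokens, `RopeWithAttentionSink` rerotates them by the position delta only.

import torch

from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope
from executorch.examples.models.llama.source_transformation.attention_sink import (
    RopeWithAttentionSink,
)

params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True)
rope = Rope(params)
rope_with_attention_sink = RopeWithAttentionSink(rope=rope)

seq_len = 4
head_dim = params.dim // params.n_heads
q = torch.randn(1, seq_len, params.n_heads, head_dim)
k = torch.randn(1, seq_len, params.n_heads, head_dim)

# Encode q and k at their original text position, as llama_transformer
# already does before the KVCache update.
original_position = 128
q_rot, k_rot = rope.forward(
    q=q,
    k=k,
    seq_len=seq_len,
    input_pos=torch.tensor([original_position], dtype=torch.int32),
)

# Later, suppose 8 tokens after the sink region are evicted, so these entries
# now live 8 slots earlier in the KVCache: reapply only the position delta.
new_position = original_position - 8
q_shifted, k_shifted = rope_with_attention_sink.forward(
    q=q_rot,
    k=k_rot,
    original_position=original_position,
    new_position=new_position,
    seq_len=seq_len,
)

# q_shifted / k_shifted now match what rope.forward would produce at
# new_position, which is what the unit tests in this diff verify.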

File tree

2 files changed: 154 additions & 0 deletions


examples/models/llama/source_transformation/attention_sink.py

Lines changed: 45 additions & 0 deletions
@@ -11,6 +11,8 @@

 import torch

+from executorch.examples.models.llama.llama_transformer import Rope
+
 from torch import nn


@@ -112,3 +114,46 @@ def update(
         narrowed_k.copy_(k_val)
         narrowed_v.copy_(v_val)
         return self.k_cache, self.v_cache
+
+
+class RopeWithAttentionSink(nn.Module):
+    """
+    Rope that helps adjust position encoding when tokens are shifted in KVCache.
+    For AttentionSink, when tokens are shifted in KVCache, we need to use positions
+    in KVCache instead of positions in the actual text.
+    """
+
+    def __init__(self, rope: Rope):
+        super().__init__()
+        self.rope = rope
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        original_position: int,
+        new_position: int,
+        seq_len: int,
+    ):
+        """
+        Rerotate keys from original_position to new_position. This is done by rerotating
+        keys with (new_position * theta - original_position * theta) with the following matrix:
+        (cos(delta), -sin(delta)
+         sin(delta), cos(delta))
+        where delta = new_position * theta - original_position * theta
+
+        Based on https://github.com/huggingface/transformers/blame/main/src/transformers/cache_utils.py#L961
+        """
+        original_freqs_cos = self.rope.freqs_cos.narrow(0, original_position, seq_len)
+        original_freqs_sin = self.rope.freqs_sin.narrow(0, original_position, seq_len)
+        new_freqs_cos = self.rope.freqs_cos.narrow(0, new_position, seq_len)
+        new_freqs_sin = self.rope.freqs_sin.narrow(0, new_position, seq_len)
+        rerotation_cos = (
+            new_freqs_cos * original_freqs_cos + new_freqs_sin * original_freqs_sin
+        )
+        rerotation_sin = (
+            new_freqs_sin * original_freqs_cos - new_freqs_cos * original_freqs_sin
+        )
+
+        q, k = self.rope.apply_rotary_emb(q, k, rerotation_cos, rerotation_sin)
+        return q, k
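For reference, `rerotation_cos` and `rerotation_sin` above are the angle-difference identities applied to the cached frequency tables. With theta_i the RoPE frequency of dimension pair i, original position o, and new position n:

\begin{aligned}
\delta_i &= (n - o)\,\theta_i, \\
\cos\delta_i &= \cos(n\theta_i)\cos(o\theta_i) + \sin(n\theta_i)\sin(o\theta_i), \\
\sin\delta_i &= \sin(n\theta_i)\cos(o\theta_i) - \cos(n\theta_i)\sin(o\theta_i),
\end{aligned}

so applying `apply_rotary_emb` with these coefficients rotates each dimension pair by delta_i, moving an entry encoded at position o to position n without undoing the original rotation or needing a new frequency table.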

examples/models/llama/source_transformation/test_attention_sink.py

Lines changed: 109 additions & 0 deletions
@@ -7,9 +7,11 @@
 import unittest

 import torch
+from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope

 from executorch.examples.models.llama.source_transformation.attention_sink import (
     KVCacheWithAttentionSink,
+    RopeWithAttentionSink,
 )


@@ -176,3 +178,110 @@ def test_update_with_all_shift(self):

         torch.testing.assert_close(k_out, expected_k_out)
         torch.testing.assert_close(v_out, expected_v_out)
+
+
+class RopeWithAttentionSinkTest(unittest.TestCase):
+
+    def setUp(self):
+        self.params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True)
+        self.rope = Rope(self.params)
+        self.rope_with_attention_sink = RopeWithAttentionSink(rope=self.rope)
+        self.seq_len = 32
+        self.n_local_heads = self.params.n_heads
+        self.n_local_kv_heads = self.params.n_heads
+        self.head_dim = self.params.dim // self.params.n_heads
+        self.q = torch.ones(
+            (1, self.seq_len, self.n_local_heads, self.head_dim), dtype=torch.float32
+        )
+        self.k = torch.full(
+            (1, self.seq_len, self.n_local_kv_heads, self.head_dim),
+            2,
+            dtype=torch.float32,
+        )
+
+    def test_rotate_backward(self):
+        original_position = 128
+        new_position = 127
+
+        pre_rotated_q, pre_rotated_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([original_position], dtype=torch.int32),
+        )
+
+        q, k = self.rope_with_attention_sink.forward(
+            q=pre_rotated_q,
+            k=pre_rotated_k,
+            original_position=original_position,
+            new_position=new_position,
+            seq_len=self.seq_len,
+        )
+
+        expected_q, expected_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([new_position], dtype=torch.int32),
+        )
+
+        torch.testing.assert_close(q, expected_q)
+        torch.testing.assert_close(k, expected_k)
+
+    def test_rotate_inplace(self):
+        original_position = 128
+        new_position = 128
+
+        pre_rotated_q, pre_rotated_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([original_position], dtype=torch.int32),
+        )
+
+        q, k = self.rope_with_attention_sink.forward(
+            q=pre_rotated_q,
+            k=pre_rotated_k,
+            original_position=original_position,
+            new_position=new_position,
+            seq_len=self.seq_len,
+        )
+
+        expected_q, expected_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([new_position], dtype=torch.int32),
+        )
+
+        torch.testing.assert_close(q, expected_q)
+        torch.testing.assert_close(k, expected_k)
+
+    def test_rotate_forward(self):
+        original_position = 128
+        new_position = 129
+
+        pre_rotated_q, pre_rotated_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([original_position], dtype=torch.int32),
+        )
+
+        q, k = self.rope_with_attention_sink.forward(
+            q=pre_rotated_q,
+            k=pre_rotated_k,
+            original_position=original_position,
+            new_position=new_position,
+            seq_len=self.seq_len,
+        )
+
+        expected_q, expected_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([new_position], dtype=torch.int32),
+        )
+
+        torch.testing.assert_close(q, expected_q)
+        torch.testing.assert_close(k, expected_k)
