
Commit fad921e

Update on "implement position encoding for shifted tokens"
AttentionSink uses tokens' positions in the KVCache instead of their positions in the actual text. When tokens get shifted in the KVCache, the position embeddings of q and k need to be updated. The original [implementation](https://github.com/mit-han-lab/streaming-llm) of AttentionSink with RoPE caches the original q and k in the KVCache and applies the position embedding during inference.

This PR adds `RopeWithAttentionSink`. It assumes that q and k are already encoded with their original positions. When we shift tokens, we reapply only the position delta. This has two benefits:

- It minimizes our code, since our existing `llama_transformer` applies the rope embedding before doing the KVCache update.
- It avoids a performance regression when tokens are not shifted, because we don't need to reapply position encoding in the KVCache for them.

Differential Revision: [D65366440](https://our.internmc.facebook.com/intern/diff/D65366440/)

[ghstack-poisoned]
1 parent 7692c67 commit fad921e
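For illustration, a minimal sketch of how the new module is intended to be used (not part of this diff; the import paths, positions, and tensor values below are assumptions based on the test file further down):

```python
# Hedged sketch only: q and k are assumed to already carry the rope encoding
# for their original text position (llama_transformer applies rope before the
# KVCache update); when the cache shifts them, only the position delta is
# reapplied. Import paths are assumed and may differ in the ExecuTorch tree.
import torch

from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope
from executorch.examples.models.llama.source_transformation.attention_sink import (
    RopeWithAttentionSink,
)

params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True)
rope = Rope(params)
rope_with_attention_sink = RopeWithAttentionSink(rope=rope)

seq_len = 32
head_dim = params.dim // params.n_heads
q = torch.randn(1, seq_len, params.n_heads, head_dim)
k = torch.randn(1, seq_len, params.n_heads, head_dim)

# Encode q and k at their original position (128) in the text.
q, k = rope.forward(
    q=q,
    k=k,
    seq_len=seq_len,
    input_pos=torch.tensor([128], dtype=torch.int32),
)

# Later, when the attention-sink KV cache shifts these tokens to position 127,
# rerotate them by the position delta instead of re-encoding from scratch.
q, k = rope_with_attention_sink.forward(
    q=q,
    k=k,
    original_position=128,
    new_position=127,
    seq_len=seq_len,
)
```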

File tree

2 files changed (+150 −150 lines)


examples/models/llama/source_transformation/attention_sink.py

Lines changed: 43 additions & 43 deletions
@@ -16,6 +16,49 @@
 from torch import nn
 
 
+class RopeWithAttentionSink(nn.Module):
+    """
+    Rope that helps adjust position encoding when tokens are shifted in KVCache.
+    For AttentionSink, when tokens are shifted in KVCache, we need to use positions
+    in KVCache instead of positions in the actual text.
+    """
+
+    def __init__(self, rope: Rope):
+        super().__init__()
+        self.rope = rope
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        original_position: int,
+        new_position: int,
+        seq_len: int,
+    ):
+        """
+        Rerotate q and k from original_position to new_position. This is done by rerotating q
+        and k with (new_position * theta - original_position * theta) with the following matrix:
+        (cos(delta), -sin(delta)
+         sin(delta), cos(delta))
+        where delta = new_position * theta - original_position * theta
+
+        Based on https://github.com/huggingface/transformers/blame/main/src/transformers/cache_utils.py#L961
+        """
+        original_freqs_cos = self.rope.freqs_cos.narrow(0, original_position, seq_len)
+        original_freqs_sin = self.rope.freqs_sin.narrow(0, original_position, seq_len)
+        new_freqs_cos = self.rope.freqs_cos.narrow(0, new_position, seq_len)
+        new_freqs_sin = self.rope.freqs_sin.narrow(0, new_position, seq_len)
+        rerotation_cos = (
+            new_freqs_cos * original_freqs_cos + new_freqs_sin * original_freqs_sin
+        )
+        rerotation_sin = (
+            new_freqs_sin * original_freqs_cos - new_freqs_cos * original_freqs_sin
+        )
+
+        q, k = self.rope.apply_rotary_emb(q, k, rerotation_cos, rerotation_sin)
+        return q, k
+
+
 class KVCacheWithAttentionSink(nn.Module):
     """
     KV cache that supports attention sink. It keeps the initial few tokens as attention sink.
@@ -114,46 +157,3 @@ def update(
         narrowed_k.copy_(k_val)
         narrowed_v.copy_(v_val)
         return self.k_cache, self.v_cache
-
-
-class RopeWithAttentionSink(nn.Module):
-    """
-    Rope that helps adjust position encoding when tokens are shifted in KVCache.
-    For AttentionSink, when tokens are shifted in KVCache, we need to use positions
-    in KVCache instead of positions in the actual text.
-    """
-
-    def __init__(self, rope: Rope):
-        super().__init__()
-        self.rope = rope
-
-    def forward(
-        self,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        original_position: int,
-        new_position: int,
-        seq_len: int,
-    ):
-        """
-        Rerotate q and k from original_position to new_position. This is done by rerotating q
-        and k with (new_position * theta - original_position * theta) with the following matrix:
-        (cos(delta), -sin(delta)
-         sin(delta), cos(delta))
-        where delta = new_position * theta - original_position * theta
-
-        Based on https://github.com/huggingface/transformers/blame/main/src/transformers/cache_utils.py#L961
-        """
-        original_freqs_cos = self.rope.freqs_cos.narrow(0, original_position, seq_len)
-        original_freqs_sin = self.rope.freqs_sin.narrow(0, original_position, seq_len)
-        new_freqs_cos = self.rope.freqs_cos.narrow(0, new_position, seq_len)
-        new_freqs_sin = self.rope.freqs_sin.narrow(0, new_position, seq_len)
-        rerotation_cos = (
-            new_freqs_cos * original_freqs_cos + new_freqs_sin * original_freqs_sin
-        )
-        rerotation_sin = (
-            new_freqs_sin * original_freqs_cos - new_freqs_cos * original_freqs_sin
-        )
-
-        q, k = self.rope.apply_rotary_emb(q, k, rerotation_cos, rerotation_sin)
-        return q, k
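For reference, the `rerotation_cos` and `rerotation_sin` expressions in the added `forward` are just the angle-subtraction identities evaluated per rope frequency, so applying `apply_rotary_emb` with them rotates q and k by exactly the position delta:

```math
\begin{aligned}
\cos(\theta_{\mathrm{new}} - \theta_{\mathrm{old}}) &= \cos\theta_{\mathrm{new}}\cos\theta_{\mathrm{old}} + \sin\theta_{\mathrm{new}}\sin\theta_{\mathrm{old}} \\
\sin(\theta_{\mathrm{new}} - \theta_{\mathrm{old}}) &= \sin\theta_{\mathrm{new}}\cos\theta_{\mathrm{old}} - \cos\theta_{\mathrm{new}}\sin\theta_{\mathrm{old}}
\end{aligned}
```

where theta_old = original_position * theta and theta_new = new_position * theta for each frequency theta.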

examples/models/llama/source_transformation/test_attention_sink.py

Lines changed: 107 additions & 107 deletions
@@ -15,6 +15,113 @@
 )
 
 
+class RopeWithAttentionSinkTest(unittest.TestCase):
+
+    def setUp(self):
+        self.params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True)
+        self.rope = Rope(self.params)
+        self.rope_with_attention_sink = RopeWithAttentionSink(rope=self.rope)
+        self.seq_len = 32
+        self.n_local_heads = self.params.n_heads
+        self.n_local_kv_heads = self.params.n_heads
+        self.head_dim = self.params.dim // self.params.n_heads
+        self.q = torch.ones(
+            (1, self.seq_len, self.n_local_heads, self.head_dim), dtype=torch.float32
+        )
+        self.k = torch.full(
+            (1, self.seq_len, self.n_local_kv_heads, self.head_dim),
+            2,
+            dtype=torch.float32,
+        )
+
+    def test_rotate_backward(self):
+        original_position = 128
+        new_position = 127
+
+        pre_rotated_q, pre_rotated_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([original_position], dtype=torch.int32),
+        )
+
+        q, k = self.rope_with_attention_sink.forward(
+            q=pre_rotated_q,
+            k=pre_rotated_k,
+            original_position=original_position,
+            new_position=new_position,
+            seq_len=self.seq_len,
+        )
+
+        expected_q, expected_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([new_position], dtype=torch.int32),
+        )
+
+        torch.testing.assert_close(q, expected_q)
+        torch.testing.assert_close(k, expected_k)
+
+    def test_rotate_inplace(self):
+        original_position = 128
+        new_position = 128
+
+        pre_rotated_q, pre_rotated_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([original_position], dtype=torch.int32),
+        )
+
+        q, k = self.rope_with_attention_sink.forward(
+            q=pre_rotated_q,
+            k=pre_rotated_k,
+            original_position=original_position,
+            new_position=new_position,
+            seq_len=self.seq_len,
+        )
+
+        expected_q, expected_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([new_position], dtype=torch.int32),
+        )
+
+        torch.testing.assert_close(q, expected_q)
+        torch.testing.assert_close(k, expected_k)
+
+    def test_rotate_forward(self):
+        original_position = 128
+        new_position = 129
+
+        pre_rotated_q, pre_rotated_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([original_position], dtype=torch.int32),
+        )
+
+        q, k = self.rope_with_attention_sink.forward(
+            q=pre_rotated_q,
+            k=pre_rotated_k,
+            original_position=original_position,
+            new_position=new_position,
+            seq_len=self.seq_len,
+        )
+
+        expected_q, expected_k = self.rope.forward(
+            q=self.q,
+            k=self.k,
+            seq_len=self.seq_len,
+            input_pos=torch.tensor([new_position], dtype=torch.int32),
+        )
+
+        torch.testing.assert_close(q, expected_q)
+        torch.testing.assert_close(k, expected_k)
+
+
 class KVCacheWithAttentionSinkTest(unittest.TestCase):
 
     def _init_cache(self):
@@ -178,110 +285,3 @@ def test_update_with_all_shift(self):
 
         torch.testing.assert_close(k_out, expected_k_out)
         torch.testing.assert_close(v_out, expected_v_out)
-
-
-class RopeWithAttentionSinkTest(unittest.TestCase):
-
-    def setUp(self):
-        self.params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True)
-        self.rope = Rope(self.params)
-        self.rope_with_attention_sink = RopeWithAttentionSink(rope=self.rope)
-        self.seq_len = 32
-        self.n_local_heads = self.params.n_heads
-        self.n_local_kv_heads = self.params.n_heads
-        self.head_dim = self.params.dim // self.params.n_heads
-        self.q = torch.ones(
-            (1, self.seq_len, self.n_local_heads, self.head_dim), dtype=torch.float32
-        )
-        self.k = torch.full(
-            (1, self.seq_len, self.n_local_kv_heads, self.head_dim),
-            2,
-            dtype=torch.float32,
-        )
-
-    def test_rotate_backward(self):
-        original_position = 128
-        new_position = 127
-
-        pre_rotated_q, pre_rotated_k = self.rope.forward(
-            q=self.q,
-            k=self.k,
-            seq_len=self.seq_len,
-            input_pos=torch.tensor([original_position], dtype=torch.int32),
-        )
-
-        q, k = self.rope_with_attention_sink.forward(
-            q=pre_rotated_q,
-            k=pre_rotated_k,
-            original_position=original_position,
-            new_position=new_position,
-            seq_len=self.seq_len,
-        )
-
-        expected_q, expected_k = self.rope.forward(
-            q=self.q,
-            k=self.k,
-            seq_len=self.seq_len,
-            input_pos=torch.tensor([new_position], dtype=torch.int32),
-        )
-
-        torch.testing.assert_close(q, expected_q)
-        torch.testing.assert_close(k, expected_k)
-
-    def test_rotate_inplace(self):
-        original_position = 128
-        new_position = 128
-
-        pre_rotated_q, pre_rotated_k = self.rope.forward(
-            q=self.q,
-            k=self.k,
-            seq_len=self.seq_len,
-            input_pos=torch.tensor([original_position], dtype=torch.int32),
-        )
-
-        q, k = self.rope_with_attention_sink.forward(
-            q=pre_rotated_q,
-            k=pre_rotated_k,
-            original_position=original_position,
-            new_position=new_position,
-            seq_len=self.seq_len,
-        )
-
-        expected_q, expected_k = self.rope.forward(
-            q=self.q,
-            k=self.k,
-            seq_len=self.seq_len,
-            input_pos=torch.tensor([new_position], dtype=torch.int32),
-        )
-
-        torch.testing.assert_close(q, expected_q)
-        torch.testing.assert_close(k, expected_k)
-
-    def test_rotate_forward(self):
-        original_position = 128
-        new_position = 129
-
-        pre_rotated_q, pre_rotated_k = self.rope.forward(
-            q=self.q,
-            k=self.k,
-            seq_len=self.seq_len,
-            input_pos=torch.tensor([original_position], dtype=torch.int32),
-        )
-
-        q, k = self.rope_with_attention_sink.forward(
-            q=pre_rotated_q,
-            k=pre_rotated_k,
-            original_position=original_position,
-            new_position=new_position,
-            seq_len=self.seq_len,
-        )
-
-        expected_q, expected_k = self.rope.forward(
-            q=self.q,
-            k=self.k,
-            seq_len=self.seq_len,
-            input_pos=torch.tensor([new_position], dtype=torch.int32),
-        )
-
-        torch.testing.assert_close(q, expected_q)
-        torch.testing.assert_close(k, expected_k)
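Independently of the ExecuTorch modules, the identity these tests exercise can be sanity-checked in isolation; a standalone sketch (not part of this diff) using a plain 2-D rotation:

```python
# Rotating by theta_old and then by the delta (theta_new - theta_old) must
# equal rotating by theta_new directly, which is the property that
# RopeWithAttentionSink relies on when it rerotates shifted tokens.
import math

import torch


def rotate(x: torch.Tensor, y: torch.Tensor, angle: float):
    cos, sin = math.cos(angle), math.sin(angle)
    return x * cos - y * sin, x * sin + y * cos


theta_old, theta_new = 128 * 0.01, 127 * 0.01  # arbitrary example angles
x, y = torch.tensor(1.0), torch.tensor(2.0)

x_old, y_old = rotate(x, y, theta_old)  # encode at the original position
x_shifted, y_shifted = rotate(x_old, y_old, theta_new - theta_old)  # reapply delta
x_direct, y_direct = rotate(x, y, theta_new)  # encode at the new position

torch.testing.assert_close(x_shifted, x_direct)
torch.testing.assert_close(y_shifted, y_direct)
```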
