 
 LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention
 https://arxiv.org/abs/2303.16199
+
+                                                                                |  Prefix cross-attention
+                                                                                |
+   ┌─────────────────┐                                                          |                ┌──────────────────┐
+   ┆        x        ┆                                                          |                ┆      prefix      ┆
+   └─────────────────┘                                                          |                └──────────────────┘
+            |                                                                   |                          |
+            ▼                                                                   |                          ▼
+  ┌──────────────────┐                                                          |               ┌─────────────────────┐
+  ┆  self-attention  ┆ ──────────────────────────────────────────────────────────────────┐      ┆  linear projection  ┆
+  └──────────────────┘                                                          |        ┆      └─────────────────────┘
+            |                                                                   |        ┆              |             \
+            ▼                                                                   |        ▼              ▼                  ▼
+          ╭───╮     ┌───────────────┐ ╭───╮ ┌────────────────────────┐          |   ┌─────────┐  ┌──────────────┐  ┌────────────────┐
+          ┆ + ┆ ◀── ┆ gating factor ┆-┆ x ┆-┆ prefix cross-attention ┆          |   ┆  query  ┆  ┆  prefix key  ┆  ┆  prefix value  ┆
+          ╰───╯     └───────────────┘ ╰───╯ └────────────────────────┘          |   └─────────┘  └──────────────┘  └────────────────┘
+            |                                                                   |            \          |             /
+            ▼                                                                   |                ▼      ▼      ▼
+                                                                                |      ┌────────────────────────────────┐
+                                                                                |      ┆  scaled dot-product attention  ┆
+                                                                                |      └────────────────────────────────┘
+
+
+In order to inject learnable information from the prefix into the pretrained model, we sum the output of
+self-attention with the output of prefix cross-attention (scaled by the gating factor). Prefix cross-attention needs a
+`query` (obtained inside self-attention as a linear projection of the input), plus a `prefix key` and `prefix value`
+(linear projections of the prefix).
+The output of prefix cross-attention is multiplied by the gating factor, a learnable parameter that protects the
+pretrained weights from being disrupted by the randomly initialized prefix tensors. The factor is initialized with
+zeros so that the adaption prompts add no noise at the early stage of training.
+More about it: https://lightning.ai/pages/community/article/understanding-llama-adapters/
+
+Note about the implementation: in the paper the adapter's prefix is concatenated with the input, while here the
+outputs of self-attention and prefix cross-attention are summed. Both variants are mathematically equivalent:
+https://github.com/ZrrSkywalker/LLaMA-Adapter/issues/47
 """
 # mypy: ignore-errors
 from dataclasses import dataclass
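
To make the equivalence note above concrete, here is a minimal, self-contained sketch with assumed toy shapes (it is not code from this repository). The "concatenated prefix" form is written out with explicit softmaxes, following the zero-init attention formulation (softmax applied to the input scores and the prefix scores separately, with the prefix part scaled by the gate); causal masking is omitted since it would apply identically to both forms.

    import torch
    import torch.nn.functional as F

    # assumed toy shapes: batch, heads, input length, prefix length, head size
    B, nh, T, aT, hs = 2, 4, 5, 3, 8
    q = torch.randn(B, nh, T, hs)
    k, v = torch.randn(B, nh, T, hs), torch.randn(B, nh, T, hs)      # keys/values from the input
    ak, av = torch.randn(B, nh, aT, hs), torch.randn(B, nh, aT, hs)  # keys/values from the prefix
    gate = torch.rand(1, nh, 1, 1)                                   # a (non-zero) gating factor

    scale = hs ** -0.5
    # paper-style form: scores over the concatenated [prefix; input] keys, softmaxed separately,
    # with the prefix part scaled by the gate
    scores_x = torch.softmax((q @ k.mT) * scale, dim=-1)
    scores_p = torch.softmax((q @ ak.mT) * scale, dim=-1) * gate
    concat_form = scores_p @ av + scores_x @ v

    # summed form used in the code below: two independent attentions, added with the gate
    sum_form = F.scaled_dot_product_attention(q, k, v) + gate * F.scaled_dot_product_attention(q, ak, av)

    print(torch.allclose(concat_form, sum_form, atol=1e-5))  # True
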
@@ -37,7 +72,8 @@ def __init__(self, config: LLaMAConfig, block_idx: int) -> None:
         if block_idx >= config.adapter_start_layer:
             # adapter embedding layer
             self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd)
-            # gate for adaption
+            # a learnable gating factor (to avoid potential disruption of pretrained weights) initialized with zeros
+            # (to avoid noise from adaption prompts at the early training stage)
             self.gating_factor = torch.nn.Parameter(torch.zeros(1, config.n_head, 1, 1))
 
         self.n_head = config.n_head
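
A small sketch of the zero-init effect described in the comment above, with assumed toy sizes: the gate is one learnable scalar per head with shape (1, n_head, 1, 1), so it broadcasts over batch, sequence and head size, and at initialization the adapter branch contributes nothing.

    import torch

    B, n_head, T, head_size = 2, 4, 5, 8                              # assumed toy sizes
    gating_factor = torch.nn.Parameter(torch.zeros(1, n_head, 1, 1))  # same shape as above

    y = torch.randn(B, n_head, T, head_size)   # stand-in for the pretrained self-attention output
    ay = torch.randn(B, n_head, T, head_size)  # stand-in for the (randomly initialized) adapter output

    out = y + gating_factor * ay               # (1, nh, 1, 1) broadcasts against (B, nh, T, hs)
    print(torch.equal(out, y))                 # True: at the start of training the adapter adds nothing
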
@@ -57,57 +93,81 @@ def forward(
         kv_cache: Optional[KVCache] = None,
         adapter_kv_cache: Optional[KVCache] = None,
     ) -> Tuple[torch.Tensor, Optional[KVCache], Optional[KVCache]]:
-        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
-
-        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
-        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
-
+        # notation:
+        # - B  | batch
+        # - T  | time-step (sequence length)
+        # - C  | embeddings size (n_embd) = head size * num heads
+        # - hs | head size
+        # - nh | number of heads
+
+        B, T, C = x.size()
+
+        # instead of calculating `query`, `key` and `value` by separately multiplying input `x` with the corresponding
+        # weight matrices, do it (for all heads) in a single multiplication with a matrix of 3x size (concatenated
+        # weights for q, k, v) and then split the result along the `embedding size` dimension
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)  # (B, T, 3 * C) --> 3 * (B, T, C)
+
+        # in order to move the num_heads (nh) dimension right after the batch (B) dimension, we first split the
+        # embedding size (C) dimension into num_heads (nh) and head_size (hs)
         head_size = C // self.n_head
         k = k.view(B, T, self.n_head, head_size)
         q = q.view(B, T, self.n_head, head_size)
         v = v.view(B, T, self.n_head, head_size)
 
-        q = apply_rope(q, rope)
-        k = apply_rope(k, rope)
+        # "Unlike standard positional embeddings rotary embeddings must be applied at every layer"
+        q = apply_rope(q, rope)  # (B, T, nh, hs)
+        k = apply_rope(k, rope)  # (B, T, nh, hs)
 
+        # now the `key`, `query` and `value` tensors are correctly represented: for each element in a batch (B)
+        # there is a number of heads (nh), and for each head there is a sequence of elements (T), each of them
+        # represented by a vector of size `hs`
         k = k.transpose(1, 2)  # (B, nh, T, hs)
         q = q.transpose(1, 2)  # (B, nh, T, hs)
         v = v.transpose(1, 2)  # (B, nh, T, hs)
 
         if kv_cache is not None:
-            cache_k, cache_v = kv_cache
+            cache_k, cache_v = kv_cache  # 2 * (B, nh, max_seq_length, hs)
             # check if reached token limit
             if input_pos[-1] >= max_seq_length:
+                # if we have reached the token limit, there is no space to put the newly calculated `key` and `value`
+                # right next to the cached ones, so we rotate the cache tensor along the `max_seq_length` dimension
+                # by one element to the left: this frees up space for the new `key` and `value`
                 input_pos = torch.tensor(max_seq_length - 1, device=input_pos.device)
                 # shift 1 position to the left
                 cache_k = torch.roll(cache_k, -1, dims=2)
                 cache_v = torch.roll(cache_v, -1, dims=2)
-            k = cache_k.index_copy(2, input_pos, k)
-            v = cache_v.index_copy(2, input_pos, v)
+            k = cache_k.index_copy(2, input_pos, k)  # (B, nh, max_seq_length, hs)
+            v = cache_v.index_copy(2, input_pos, v)  # (B, nh, max_seq_length, hs)
             kv_cache = k, v
 
         # efficient attention using Flash Attention CUDA kernels
-        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+        # ↓ (B, nh, T, hs) @ (B, nh, T, hs).mT --> (B, nh, T, T) @ (B, nh, T, hs) --> (B, nh, T, hs)
+        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)  # (B, nh, T, hs)
 
+        # "Adapters are applied to the topmost layers to better tune the language
+        # representations with higher-level semantics".
         if self.block_idx >= self.adapter_start_layer:
             if adapter_kv_cache is not None:
-                ak, av = adapter_kv_cache
+                ak, av = adapter_kv_cache  # 2 * (B, nh, aT, hs)
             else:
                 prefix = self.adapter_wte.weight.reshape(1, self.adapter_prompt_length, self.n_embd)
                 aT = prefix.size(1)
-                _, ak, av = self.c_attn(prefix).split(self.n_embd, dim=2)
-                ak = ak.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2)
-                av = av.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2)
+                _, ak, av = self.c_attn(prefix).split(self.n_embd, dim=2)  # (1, aT, 3 * C) --> 3 * (1, aT, C)
+                ak = ak.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2)  # (B, nh, aT, hs)
+                av = av.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2)  # (B, nh, aT, hs)
                 adapter_kv_cache = (ak, av)
 
-            amask = torch.ones(q.shape[-2], ak.shape[-2], dtype=torch.bool, device=x.device)
-            ay = F.scaled_dot_product_attention(q, ak, av, attn_mask=amask, dropout_p=0.0, is_causal=False)
+            # Apply cross-attention with `query`, `adapter_key`, `adapter_value` and sum the output with the output
+            # of the self-attention step. This is mathematically equivalent to concatenating the prefix with the input, as per the paper.
+            amask = torch.ones(q.shape[-2], ak.shape[-2], dtype=torch.bool, device=x.device)  # (T, aT)
+            # ↓ (B, nh, T, hs) @ (B, nh, aT, hs).mT --> (B, nh, T, aT) @ (B, nh, aT, hs) --> (B, nh, T, hs)
+            ay = F.scaled_dot_product_attention(q, ak, av, attn_mask=amask, dropout_p=0.0, is_causal=False)  # (B, nh, T, hs)
             y = y + self.gating_factor * ay
 
         y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
 
         # output projection
-        y = self.c_proj(y)
+        y = self.c_proj(y)  # (B, T, C)
 
         return y, kv_cache, adapter_kv_cache
 
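The cache-rotation step commented in the hunk above can be looked at in isolation. Below is a tiny sketch with an assumed toy cache (not this repository's API, just the same two torch calls): once the cache is full, `torch.roll` shifts every cached position one slot to the left and `index_copy` overwrites the freed last slot with the newest entry.

    import torch

    # assumed toy cache: batch=1, heads=1, max_seq_length=4, head_size=2
    cache_k = torch.arange(8, dtype=torch.float32).reshape(1, 1, 4, 2)
    new_k = torch.full((1, 1, 1, 2), 9.0)       # key of the incoming token

    input_pos = torch.tensor([3])               # write position: the last slot
    cache_k = torch.roll(cache_k, -1, dims=2)   # drop the oldest position, shift the rest left
    cache_k = cache_k.index_copy(2, input_pos, new_k)

    print(cache_k[0, 0, :, 0])                  # tensor([2., 4., 6., 9.])
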
@@ -202,9 +262,9 @@ def forward(
         assert T <= block_size, f"Cannot forward sequence of length {T}, block size is only {block_size}"
 
         if self.rope_cache is None:
-            self.rope_cache = self.build_rope_cache(idx)
+            self.rope_cache = self.build_rope_cache(idx)  # (block_size, head_size / 2, 2)
         if self.mask_cache is None:
-            self.mask_cache = self.build_mask_cache(idx)
+            self.mask_cache = self.build_mask_cache(idx)  # (1, 1, block_size, block_size)
 
         if input_pos is not None:
             rope = self.rope_cache.index_select(0, input_pos)
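
For the `(1, 1, block_size, block_size)` shape noted above, here is a plausible construction of such a causal mask cache (assumed for illustration; the actual `build_mask_cache` lives elsewhere in the model code):

    import torch

    block_size = 6  # assumed toy size
    ones = torch.ones(block_size, block_size, dtype=torch.bool)
    mask_cache = torch.tril(ones).unsqueeze(0).unsqueeze(0)  # (1, 1, block_size, block_size)

    T = 4
    mask = mask_cache[:, :, :T, :T]      # slice used for a sequence of length T, as in the hunk below
    print(mask_cache.shape, mask.shape)  # torch.Size([1, 1, 6, 6]) torch.Size([1, 1, 4, 4])
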
@@ -215,7 +275,7 @@ def forward(
             mask = self.mask_cache[:, :, :T, :T]
 
         # forward the model itself
-        x = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
+        x = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
 
         if input_pos is None:  # proxy for use_cache=False
             for block in self.transformer.h:
@@ -235,9 +295,9 @@ def forward(
                     x, rope, mask, max_seq_length, input_pos, self.kv_caches[i], self.adapter_kv_caches[i]
                 )
 
-        x = self.transformer.ln_f(x)
+        x = self.transformer.ln_f(x)  # (B, T, n_embd)
 
-        logits = self.lm_head(x)  # (b, t, vocab_size)
+        logits = self.lm_head(x)  # (B, T, vocab_size)
 
         return logits
 
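
Putting the shape annotations together, here is a toy end-to-end pass with assumed dimensions (RoPE, KV caching, the adapter branch and the rest of the transformer block are omitted for brevity; names like `c_attn` and `lm_head` only mirror the diff for readability):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    B, T, C, n_head, vocab_size = 2, 6, 32, 4, 100   # assumed toy dimensions
    head_size = C // n_head

    c_attn = nn.Linear(C, 3 * C, bias=False)   # stand-in for self.c_attn
    c_proj = nn.Linear(C, C, bias=False)       # stand-in for self.c_proj
    lm_head = nn.Linear(C, vocab_size, bias=False)

    x = torch.randn(B, T, C)                                     # token embeddings, (B, T, n_embd)
    q, k, v = c_attn(x).split(C, dim=2)                          # (B, T, 3 * C) --> 3 * (B, T, C)
    q = q.view(B, T, n_head, head_size).transpose(1, 2)          # (B, nh, T, hs)
    k = k.view(B, T, n_head, head_size).transpose(1, 2)          # (B, nh, T, hs)
    v = v.view(B, T, n_head, head_size).transpose(1, 2)          # (B, nh, T, hs)

    y = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # (B, nh, T, hs)
    y = c_proj(y.transpose(1, 2).contiguous().view(B, T, C))     # (B, T, C)

    logits = lm_head(y)                                          # (B, T, vocab_size)
    print(logits.shape)  # torch.Size([2, 6, 100])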