@@ -272,7 +272,11 @@ def __init__(
         )
 
         self.norm_1 = nn.Identity() if not config.norm_1 else config.norm_class(config.n_embd, eps=config.norm_eps)
-        self.attn = CausalSelfAttention(config, block_idx)
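+        # Use multi-head latent attention (MLA) instead of standard causal self-attention
+        # when `config.latent_attention` is set.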
+        self.attn = (
+            CausalSelfAttention(config, block_idx)
+            if not config.latent_attention
+            else MultiheadLatentAttention(config, block_idx)
+        )
         self.post_attention_norm = (
             config.norm_class(config.n_embd, eps=config.norm_eps) if config.post_attention_norm else nn.Identity()
         )
@@ -549,6 +553,146 @@ def _load_from_state_dict(self, state_dict: dict, prefix: str, *args: Any, **kwa
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
 
 
+class MultiheadLatentAttention(nn.Module):
+    def __init__(self, config: Config, block_idx: int) -> None:
+        super().__init__()
+
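+        # Low-rank query path: compress the hidden state to q_lora_rank, normalize, then expand
+        # to n_head * qk_head_dim, where qk_head_dim = qk_nope_head_dim + qk_rope_head_dim.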
+        self.q_a_proj = nn.Linear(config.n_embd, config.q_lora_rank, bias=config.attn_bias)
+        self.q_a_norm = RMSNorm(config.q_lora_rank, eps=config.norm_eps)
+        self.q_b_proj = nn.Linear(config.q_lora_rank, config.n_head * config.qk_head_dim, bias=config.bias)
+
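+        # Keys and values share one compressed latent of size kv_lora_rank; the additional
+        # qk_rope_head_dim slice is a decoupled key that carries the rotary position signal.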
+        self.kv_a_proj_with_mqa = nn.Linear(
+            config.n_embd, config.kv_lora_rank + config.qk_rope_head_dim, bias=config.attn_bias
+        )
+        self.kv_a_norm = RMSNorm(config.kv_lora_rank, eps=config.norm_eps)
+        self.kv_b_proj = nn.Linear(
+            config.kv_lora_rank,
+            config.n_query_groups * (config.qk_nope_head_dim + config.v_head_dim),
+            bias=config.bias,
+        )
+
+        # output projection
+        self.proj = nn.Linear(config.n_head * config.v_head_dim, config.n_embd, bias=config.bias)
+        # disabled by default
+        self.kv_cache: Optional[KVCache] = None
+
+        self.config = config
+        self.block_idx = block_idx
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        input_pos: Optional[torch.Tensor] = None,
+        input_pos_maxp1: Optional[int] = None,
+    ) -> torch.Tensor:
+        # Notation:
+        # - B          | batch size
+        # - T          | time-step (sequence length)
+        # - C          | model's embedding size (n_embd)
+        # - C*         | attention's embedding size
+        # - hs         | head size
+        # - nh_(q,k,v) | number of heads for query, key and value
+        # - n_query_groups = nh_k = nh_v | number of query groups sharing key and value heads
+        #   alternative notation: num_kv_groups = n_query_groups
+        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
+
+        q = self.q_b_proj(self.q_a_norm(self.q_a_proj(x)))  # (B, T, n_head * qk_head_dim)
+        q = q.view(B, T, -1, self.config.qk_head_dim)  # (B, T, n_head, qk_head_dim)
+        q = q.transpose(1, 2)  # (B, n_head, T, qk_head_dim)
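+        # Split each query head into a content part (qk_nope_head_dim, no positional encoding)
+        # and a rotary part (qk_rope_head_dim) that receives RoPE below.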
+        q_pass, q_rot = torch.split(q, [self.config.qk_nope_head_dim, self.config.qk_rope_head_dim], dim=-1)
+
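+        # The joint projection yields the compressed KV latent plus a single RoPE key slice
+        # shared across all heads (multi-query style).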
+        compressed_kv = self.kv_a_proj_with_mqa(x)  # (B, T, kv_lora_rank + qk_rope_head_dim)
+        k_pass, k_rot = torch.split(compressed_kv, [self.config.kv_lora_rank, self.config.qk_rope_head_dim], dim=-1)
+
+        k_pass = self.kv_b_proj(self.kv_a_norm(k_pass))
+        k_pass = k_pass.view(B, T, self.config.n_query_groups, -1)
+        k_pass = k_pass.transpose(1, 2)
+
+        k_pass, v = torch.split(k_pass, [self.config.qk_nope_head_dim, self.config.v_head_dim], dim=-1)
+        k_rot = k_rot.view(B, 1, T, self.config.qk_rope_head_dim)  # (B, 1, T, qk_rope_head_dim)
+
+        # Unlike standard positional embeddings, rotary embeddings must be applied at every layer.
+        q_roped = apply_rope(q_rot, cos, sin)
+        k_roped = apply_rope(k_rot, cos, sin)
+        k_roped = k_roped.expand(*k_pass.shape[:-1], -1)  # (B, n_head, T, qk_rope_head_dim)
+
+        q = torch.cat((q_pass, q_roped), dim=-1)
+        k = torch.cat((k_pass, k_roped), dim=-1)
+
+        # Apply kv-cache during inference.
+        if input_pos is not None:
+            if not isinstance(self.kv_cache, KVCache):
+                raise TypeError("You need to call `gpt.set_kv_cache()`")
+            k, v = self.kv_cache(input_pos, k, v)
+            if input_pos_maxp1 is not None:
+                # Subselect along sequence dimension
+                k = k[..., :input_pos_maxp1, :]
+                v = v[..., :input_pos_maxp1, :]
+            # k, v: (B, nh_k, input_pos_maxp1, hs)
+            # If input_pos_maxp1 is None -> max_seq_length
+
+        # Grouped queries: balance the number of heads across all three matrices.
+        # NOTE: flash attention requires it in training mode.
+        # Multi-query: this step can be skipped since there is only 1 head, allowing us to use broadcasting.
+        if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1):
+            q_per_kv = self.config.n_head // self.config.n_query_groups
+            k = k.repeat_interleave(q_per_kv, dim=1)  # (B, nh_q, T, hs)
+            v = v.repeat_interleave(q_per_kv, dim=1)  # (B, nh_q, T, hs)
+
+        # Efficient attention using Flash Attention CUDA kernels.
+        # NOTE: the efficient implementation is disabled if `mask` is not None or softcapping is enabled.
+        # ↓ (B, nh, T, hs) @ (B, nh, T, hs).mT --> (B, nh, T, T) @ (B, nh, T, hs) --> (B, nh, T, hs)
+        y = self.scaled_dot_product_attention(q, k, v, mask)
+
+        # Re-assemble all head outputs side by side.
+        y = y.reshape(B, T, self.config.n_head * self.config.v_head_dim)
+
+        # Output projection.
+        return self.proj(y)  # (B, T, C)
+
+    def scaled_dot_product_attention(
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        scale = 1.0 / math.sqrt(self.config.attention_scores_scalar or self.config.qk_head_dim)
+
+        # with softcapping we cannot use SDPA
+        if self.config.attention_logit_softcapping is not None:
+            scores = q @ k.mT * scale
+            scores = do_softcapping(scores, self.config.attention_logit_softcapping)
+            if mask is None:
+                mask = torch.ones(q.size(2), q.size(2), dtype=q.dtype, device=q.device).triu(diagonal=1)
+                mask.masked_fill_(mask.bool(), torch.finfo(q.dtype).min)
+            scores = scores + mask
+            scores = F.softmax(scores, dim=-1, dtype=torch.float).to(dtype=q.dtype)
+            y = scores @ v
+        else:
+            y = F.scaled_dot_product_attention(
+                q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None
+            )
+        return y.transpose(1, 2)
+
+    def build_kv_cache(
+        self,
+        batch_size: int,
+        max_seq_length: int,
+        rope_cache_length: Optional[int] = None,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> "KVCache":
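+        # The cache stores the decompressed per-head keys (qk_head_dim) and values (v_head_dim),
+        # not the compressed kv_lora_rank latent.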
+        v_shape = (batch_size, self.config.n_head, max_seq_length, self.config.v_head_dim)
+        k_shape = (batch_size, self.config.n_head, max_seq_length, self.config.qk_head_dim)
+
+        if rope_cache_length is not None:
+            print("Warning: `rope_cache_length` has no effect on MultiheadLatentAttention!")
+        if self.config.rotary_percentage != 1.0:
+            print("Warning: `rotary_percentage` has no effect on MultiheadLatentAttention!")
+
+        return KVCache(k_shape, v_shape, device=device, dtype=dtype)
+
+
 class GptNeoxMLP(nn.Module):
     def __init__(self, config: Config, intermediate_size: Optional[int] = None) -> None:
         super().__init__()