@@ -203,6 +203,7 @@ def rope_cache(self, device: Optional[torch.device] = None) -> Tuple[torch.Tensor, torch.Tensor]:
             condense_ratio=self.config.rope_condense_ratio,
             base=self.config.rope_base,
             extra_config=extra_config,
+            rope_local_base_freq=self.config.rope_local_base_freq,
         )

     def set_kv_cache(
@@ -567,6 +568,7 @@ def build_rope_cache(
     base: int = 10000,
     condense_ratio: int = 1,
     extra_config: Optional[dict] = None,
+    rope_local_base_freq: Optional[float] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Enhanced Transformer with Rotary Position Embedding.
@@ -620,6 +622,17 @@ def build_rope_cache(
     if idx_theta.shape[-1] > n_elem > 1:
         idx_theta = idx_theta[..., :n_elem]

+    # if rope_local_base_freq is given, have a separate rope value for local embedding
+    # For now, we use default RoPE for local embedding
+    if rope_local_base_freq is not None:
+        local_theta = 1.0 / (rope_local_base_freq ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))
+        local_idx_theta = torch.outer(seq_idx, local_theta)
+        local_idx_theta = local_idx_theta.repeat(1, 2)
+        if local_idx_theta.shape[-1] > n_elem > 1:
+            local_idx_theta = local_idx_theta[..., :n_elem]
+
+        idx_theta = torch.stack((idx_theta, local_idx_theta), dim=-1)
+
     return torch.cos(idx_theta), torch.sin(idx_theta)

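Below is a minimal, standalone sketch of what the added branch computes, useful for seeing the resulting cache shapes. The function name dual_rope_cache_sketch and its simplified signature are illustrative only (not part of the library), and the extra_config handling and the n_elem truncation check from the real build_rope_cache are omitted for brevity.

import torch
from typing import Optional, Tuple

def dual_rope_cache_sketch(
    seq_len: int,
    n_elem: int,
    base: int = 10000,
    rope_local_base_freq: Optional[float] = None,
    device: Optional[torch.device] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Global RoPE angles: theta_i = base^(-2i / n_elem) for positions 0..seq_len-1.
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))
    seq_idx = torch.arange(seq_len, device=device).float()
    idx_theta = torch.outer(seq_idx, theta).repeat(1, 2)  # (seq_len, n_elem)

    if rope_local_base_freq is not None:
        # Same construction with a different base, intended for local-attention layers.
        local_theta = 1.0 / (
            rope_local_base_freq ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)
        )
        local_idx_theta = torch.outer(seq_idx, local_theta).repeat(1, 2)
        # Stack global and local angles along a new trailing axis: (seq_len, n_elem, 2).
        idx_theta = torch.stack((idx_theta, local_idx_theta), dim=-1)

    return torch.cos(idx_theta), torch.sin(idx_theta)

cos, sin = dual_rope_cache_sketch(seq_len=8, n_elem=16, rope_local_base_freq=10000.0)
print(cos.shape, sin.shape)  # torch.Size([8, 16, 2]) each: index 0 = global cache, 1 = local cache

With rope_local_base_freq set, the returned cos/sin gain a trailing dimension of size 2, so a model mixing global and local (e.g. sliding-window) attention layers would presumably select cos[..., 0]/sin[..., 0] or cos[..., 1]/sin[..., 1] per layer when applying RoPE.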