@@ -3,7 +3,7 @@
 from copy import deepcopy
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Literal, Optional, Type, Union
+from typing import Any, List, Literal, Optional, Type, Union
 
 import torch
 import yaml
@@ -58,9 +58,7 @@ class Config:
     attn_bias: bool = False
     attention_scores_scalar: Optional[int] = None
     sliding_window_size: Optional[int] = None
-    sliding_window_layer_placing: Optional[Literal["all", "interleaved"]] = None
-    sliding_window_layer_stride: Optional[int] = None
-    sliding_window_offset: int = 0
+    sliding_window_indices: Optional[List] = None
     # if `attention_logit_softcapping` is used, cannot use optimized
     # `torch.nn.functional.scaled_dot_product_attention` (which implements
     # Flash attention), may result in higher memory and runtime footprint.
@@ -114,14 +112,11 @@ def __post_init__(self):
 
         self.rope_n_elem = int(self.rotary_percentage * self.head_size)
 
-        if self.sliding_window_size is not None:
-            self.sliding_window_layer_stride = (
-                (1 if (self.sliding_window_layer_placing is None or self.sliding_window_layer_placing == "all") else 2)
-                if self.sliding_window_layer_stride is None
-                else self.sliding_window_layer_stride
-            )
+        if self.sliding_window_size is not None and self.sliding_window_indices is None:
+            self.sliding_window_indices = [1] * self.n_layer
 
-            self.sliding_window_block_idx_map_fn = lambda x: x + self.sliding_window_offset
+        if self.rope_local_base_freq is not None and self.rope_indices is None:
+            self.rope_indices = [1] * self.n_layer
 
     @classmethod
     def from_name(cls, name: str, **kwargs: Any) -> Optional[Self]:
@@ -974,7 +969,7 @@ def norm_class(self) -> Type:
         block_size=8192,
         sliding_window_size=4096,
         # only layer with idx 0, 2, 4, ... have sliding window attention
-        sliding_window_layer_placing="interleaved",
+        sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(26)],
         intermediate_size=9216,
         n_embd=2304,
         n_layer=26,
@@ -1002,7 +997,7 @@ def norm_class(self) -> Type:
         block_size=8192,
         sliding_window_size=4096,
         # only layer with idx 0, 2, 4, ... have sliding window attention
-        sliding_window_layer_placing="interleaved",
+        sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(42)],
         intermediate_size=14336,
         n_embd=3584,
         n_layer=42,
@@ -1032,7 +1027,7 @@ def norm_class(self) -> Type:
         block_size=8192,
         sliding_window_size=4096,
         # only layer with idx 0, 2, 4, ... have sliding window attention
-        sliding_window_layer_placing="interleaved",
+        sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(46)],
         intermediate_size=36864,
         n_embd=4608,
         n_layer=46,
@@ -1549,7 +1544,6 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         parallel_residual=False,
         sliding_window_size=2048,
-        sliding_window_layer_placing="all",
     ),
     # https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json
     dict(
@@ -1567,7 +1561,6 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         parallel_residual=False,
         sliding_window_size=262145,
-        sliding_window_layer_placing="all",
     ),
     # https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/config.json
     dict(
@@ -1622,7 +1615,6 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         parallel_residual=False,
         sliding_window_size=262145,
-        sliding_window_layer_placing="all",
     ),
 ]
 configs.extend(phi)
@@ -1649,7 +1641,6 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         intermediate_size=14336,
         sliding_window_size=4096,
-        sliding_window_layer_placing="all",
     )
 )
 
@@ -1670,7 +1661,6 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         intermediate_size=14336,
         sliding_window_size=4096,
-        sliding_window_layer_placing="all",
     ),
     # https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json
     dict(
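For context, a minimal sketch (not part of the diff) of how the removed `sliding_window_layer_placing` options map onto the new per-layer `sliding_window_indices` list; `n_layer = 26` mirrors the 26-layer Gemma-2 config above, and the variable names here are illustrative only.

# Illustrative only: translating the old placement options into the new
# per-layer list that this change introduces.
n_layer = 26  # matches the 26-layer Gemma-2 config shown in the diff

# old "all" (and the new __post_init__ default): every layer uses sliding window attention
sliding_window_all = [1] * n_layer

# old "interleaved": only layers 0, 2, 4, ... use sliding window attention
sliding_window_interleaved = [1 if i % 2 == 0 else 0 for i in range(n_layer)]

assert sliding_window_interleaved[:4] == [1, 0, 1, 0]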