Commit d471d44

Gemma3 local RoPE fixes

Parent: a03db45

4 files changed (+25, -7 lines)

exllamav2/architecture.py

Lines changed: 4 additions & 0 deletions
@@ -169,6 +169,8 @@ class Params:
     swa = False
     alternating_swa = False
     sliding_rope_theta = None
+    sliding_rope_scale = None
+    pos_id_index = 0
 
     # Model only works with eager attention
     eager_attn_only = False
@@ -508,6 +510,7 @@ class Params:
             self.lm.alternating_swa = True
             self.lm.residual_stream_fp32 = True
             self.lm.sliding_rope_theta = 10000
+            self.lm.sliding_rope_scale = 1
             self.lm.default_vocab_size = 262208
             self.lm.default_rms_norm_eps = 1e-06
             self.lm.default_head_dim = 256
@@ -516,6 +519,7 @@ class Params:
             self.lm.default_use_qk_norm = True
             self.lm.default_sliding_window_pattern = 6
             self.lm.default_rope_theta = 1e6
+            self.lm.pos_id_index = 1
 
             self.vt_prefix = "vision_tower.vision_model."
             self.vt.keys.update({
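For orientation, here is a hedged, self-contained summary of what the two new architecture parameters express for Gemma3's text model. The dict and its key names are illustrative only (they are not part of the codebase); the values come from this diff and the config.py/device.py hunks below.

```python
# Illustrative only: how the new parameters pair up downstream.
gemma3_text_rope = {
    # full-attention layers (rope table index 0): theta from default_rope_theta,
    # linear scale taken from config.json rope_scaling if present
    "global": {"theta": 1e6, "scale": None},
    # sliding-window layers (rope table index 1): the new sliding_rope_scale
    # pins the local rope to an unscaled theta of 10000
    "local": {"theta": 10000, "scale": 1},
    # pos_id_index shifts every RoPE position id; 1 for Gemma3, default 0
    "pos_id_index": 1,
}
```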

exllamav2/config.py

Lines changed: 7 additions & 2 deletions
@@ -100,6 +100,8 @@ class ExLlamaV2Config:
     vocab_size: int
     rotary_embedding_base: float
     rotary_embedding_base_alt: float | None
+    pos_id_index: int
+    scale_pos_emb_alt: float | None
     scale_long_factor: list[float] | None
     scale_short_factor: list[float] | None
     alt_rope_method: str | None
@@ -358,6 +360,8 @@ def prepare(self, no_tensors: bool = False):
         )
 
         self.rotary_embedding_base_alt = self.arch.lm.sliding_rope_theta
+        self.scale_pos_emb_alt = self.arch.lm.sliding_rope_scale
+        self.pos_id_index = self.arch.lm.pos_id_index
 
         self.max_seq_len = read(
             read_config,
@@ -373,11 +377,12 @@ def prepare(self, no_tensors: bool = False):
 
         self.partial_rotary_factor = read(read_config, float, "partial_rotary_factor", 1.0)
 
-        rs = read(read_config, dict, "rope_scaling", None)
+        rs = read(read_config, dict, ["rope_scaling", "text_config->rope_scaling"], None)
         if rs:
             scaling_type = rs.get("type", None)
             rope_type = rs.get("rope_type", None)
             assert not (scaling_type and rope_type), "rope_scaling key has both `type` and `rope_type` subkeys"
+            if not scaling_type: scaling_type = rope_type
             if scaling_type == "linear":
                 assert "factor" in rs, "'factor' missing from 'rope_scaling' config"
                 self.scale_pos_emb = rs.get("factor", 1.0)
@@ -394,7 +399,7 @@ def prepare(self, no_tensors: bool = False):
                 self.alt_rope_method = "yarn"
                 self.yarn_rope_factor = rs["factor"]
                 self.yarn_rope_original_max_position_embeddings = rs["original_max_position_embeddings"]
-            if rope_type == "llama3":
+            if scaling_type == "llama3":
                 self.alt_rope_method = "llama3"
                 self.l3_rope_factor = rs["factor"]
                 self.l3_rope_low_freq_factor = rs["low_freq_factor"]
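As a rough illustration of the parsing change above: `rope_scaling` may now live either at the top level of config.json or nested under `text_config`, and its discriminator may be spelled `type` or `rope_type`. The standalone sketch below mirrors that fallback with an invented helper name; in exllamav2 the nested `text_config->rope_scaling` path is handled by the repo's `read()` helper, not like this.

```python
# Hypothetical helper, not exllamav2 code: mirrors the fallback logic added above.
def read_rope_scaling(read_config: dict) -> dict | None:
    rs = read_config.get("rope_scaling") \
         or read_config.get("text_config", {}).get("rope_scaling")
    if not rs:
        return None
    scaling_type = rs.get("type", None)
    rope_type = rs.get("rope_type", None)
    assert not (scaling_type and rope_type), "rope_scaling key has both `type` and `rope_type` subkeys"
    if not scaling_type: scaling_type = rope_type
    return dict(rs, type = scaling_type)

# e.g. a Gemma3-style multimodal config nests the key under text_config:
read_rope_scaling({"text_config": {"rope_scaling": {"rope_type": "linear", "factor": 8.0}}})
# -> {"rope_type": "linear", "factor": 8.0, "type": "linear"}
```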

exllamav2/device.py

Lines changed: 8 additions & 2 deletions
@@ -42,6 +42,8 @@ class ExLlamaV2DeviceContext:
 
     sin: list[torch.Tensor] | None
     cos: list[torch.Tensor] | None
+    local_sin: list[torch.Tensor] | None
+    local_cos: list[torch.Tensor] | None
 
     scratch: torch.Tensor | None
 
@@ -119,13 +121,15 @@ def prepare_sincos(self):
         cfg = self.model.config
 
         thetas = [cfg.rotary_embedding_base]
+        scales = [cfg.scale_pos_emb]
         if cfg.rotary_embedding_base_alt:
             thetas.append(cfg.rotary_embedding_base_alt)
+            scales.append(cfg.scale_pos_emb_alt)
 
         self.sin = []
         self.cos = []
 
-        for theta in thetas:
+        for theta, lscale in zip(thetas, scales):
 
             if self.archparams.rope_style == RopeStyle.NONE:
                 sin = torch.zeros((1,), device = device, dtype = torch.half)
@@ -140,8 +144,10 @@ def prepare_sincos(self):
 
             # Common
 
-            scale = cfg.scale_pos_emb or 1.0
+            scale = lscale or 1.0
             t = torch.arange(cfg.max_seq_len, device = device, dtype = torch.float32)
+            if cfg.pos_id_index != 0:
+                t += cfg.pos_id_index
             if scale != 1.0: t /= scale
 
             freqs = torch.einsum("i,j->ij", t, inv_freq)
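To see the per-variant scale and the position-id offset in isolation, here is a self-contained sketch. The function and argument names are made up; the real logic lives in `ExLlamaV2DeviceContext.prepare_sincos` and also deals with rope styles, partial rotary factors and the other scaling methods. The numeric values in the usage lines are illustrative.

```python
import torch

# Hypothetical standalone version of the sin/cos table construction:
# one table per RoPE variant, each with its own theta and linear scale,
# and position ids optionally shifted by pos_id_index before scaling.
def make_sincos(theta: float, scale: float | None, pos_id_index: int,
                head_dim: int, max_seq_len: int, device = "cpu"):
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2, device = device, dtype = torch.float32) / head_dim))
    t = torch.arange(max_seq_len, device = device, dtype = torch.float32)
    if pos_id_index != 0:
        t += pos_id_index          # e.g. Gemma3 starts RoPE position ids at 1
    scale = scale or 1.0
    if scale != 1.0:
        t /= scale                 # linear RoPE scaling, applied per variant
    freqs = torch.einsum("i,j->ij", t, inv_freq)
    return freqs.sin().half(), freqs.cos().half()

# Illustrative values: a scaled global table and an unscaled local (sliding-window) table.
sin_glb, cos_glb = make_sincos(theta = 1e6,   scale = 8.0, pos_id_index = 1, head_dim = 256, max_seq_len = 4096)
sin_loc, cos_loc = make_sincos(theta = 10000, scale = 1.0, pos_id_index = 1, head_dim = 256, max_seq_len = 4096)
```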

exllamav2/model.py

Lines changed: 6 additions & 3 deletions
@@ -109,9 +109,12 @@ def __init__(
             rope_index = 0
 
             if cfg.arch.lm.alternating_swa:
-                swa = cfg.sliding_window if (layer_idx + 1) % cfg.sliding_window_pattern != 0 else 0
-                if cfg.rotary_embedding_base_alt:
-                    rope_index = 1
+                if cfg.sliding_window_pattern > 1:
+                    swa = cfg.sliding_window if (layer_idx + 1) % cfg.sliding_window_pattern != 0 else 0
+                    if cfg.rotary_embedding_base_alt:
+                        rope_index = 1
+                else:
+                    swa = cfg.sliding_window if not bool(layer_idx % 2) else 0
             elif cfg.arch.lm.swa:
                 swa = cfg.sliding_window
             else:
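The new `else` branch covers models that report alternating sliding-window attention without an explicit pattern. A small illustration of the resulting layer layout follows; the helper is hypothetical and only mirrors the `swa` selection, not the rope_index handling, and the sliding-window size is an example value.

```python
# Hypothetical helper mirroring the per-layer sliding-window choice above.
def swa_for_layer(layer_idx: int, sliding_window: int, pattern: int) -> int:
    if pattern > 1:
        # every pattern-th layer is global (swa = 0), the rest use the sliding window
        return sliding_window if (layer_idx + 1) % pattern != 0 else 0
    # pattern <= 1: simple alternation, even layers sliding, odd layers global
    return sliding_window if not bool(layer_idx % 2) else 0

# e.g. pattern = 6 (Gemma3's default_sliding_window_pattern):
print([swa_for_layer(i, 1024, 6) for i in range(8)])  # [1024, 1024, 1024, 1024, 1024, 0, 1024, 1024]
# pattern = 1: every other layer slides:
print([swa_for_layer(i, 1024, 1) for i in range(4)])  # [1024, 0, 1024, 0]
```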
