
Commit a5ac374

feat: support linear scaled rope for tgis_native llama (#61)
Implements a new LinearScalingPositionRotaryEmbedding layer that supports linear scaling of position ids when processing embeddings. Without this, models with a linear rope_scaling configuration could load fine but would give garbage output.

Signed-off-by: Travis Johnson <[email protected]>
Co-authored-by: TRAVIS JOHNSON <[email protected]>
1 parent 03db106 commit a5ac374
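
To make the behaviour concrete, here is a minimal, standalone sketch of the configuration dispatch this commit adds. The rope_scaling dict and its factor of 4.0 are illustrative values a long-context fine-tune might ship in its config.json, not values taken from this commit.

# Minimal sketch of the rope_scaling dispatch added in flash_llama_modeling.py.
# The dict below is a hypothetical example of what a long-context fine-tune
# might carry in its config.json; it is not taken from this commit.
rope_scaling = {"type": "linear", "factor": 4.0}

if rope_scaling and "type" in rope_scaling:
    if rope_scaling["type"] == "linear":
        # This commit routes to LinearScalingPositionRotaryEmbedding here.
        chosen = ("LinearScalingPositionRotaryEmbedding", rope_scaling.get("factor", 1.0))
    else:
        # Any other scaling type is rejected rather than silently mis-rotating positions.
        raise ValueError(f"rope_scaling of type {rope_scaling['type']} is not supported")
else:
    # No rope_scaling entry: behaviour is unchanged.
    chosen = ("PositionRotaryEmbedding", 1.0)

print(chosen)  # ('LinearScalingPositionRotaryEmbedding', 4.0)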

2 files changed (+42, -10 lines)

server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 17 additions & 6 deletions
@@ -35,6 +35,7 @@
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     PositionRotaryEmbedding,
+    LinearScalingPositionRotaryEmbedding,
     TensorParallelHead,
     get_linear,
 )
@@ -183,12 +184,22 @@ def __init__(
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.num_heads

-        # self.rotary_emb = PositionRotaryEmbedding.load(
-        #     prefix=f"{prefix}.rotary_emb", weights=weights
-        # )
-        self.rotary_emb = PositionRotaryEmbedding.static(
-            dim=self.head_size, base=config.rope_theta, device=weights.device
-        )
+        if config.rope_scaling and "type" in config.rope_scaling:
+            if config.rope_scaling["type"] == "linear":
+                self.rotary_emb = LinearScalingPositionRotaryEmbedding.static(
+                    dim=self.head_size,
+                    base=config.rope_theta,
+                    scaling_factor=config.rope_scaling.get("factor", 1.0),
+                    device=weights.device
+                )
+            else:
+                raise ValueError(
+                    f"rope_scaling of type {config.rope_scaling['type']} is not supported with FLASH_ATTENTION=True"
+                )
+        else:
+            self.rotary_emb = PositionRotaryEmbedding.static(
+                dim=self.head_size, base=config.rope_theta, device=weights.device
+            )

         self.softmax_scale = self.head_size**-0.5
server/text_generation_server/utils/layers.py

Lines changed: 25 additions & 4 deletions
@@ -389,24 +389,25 @@ def forward(self, hidden_states, residual=None):
     from flash_attn.layers.rotary import RotaryEmbedding
     import rotary_emb

-    class PositionRotaryEmbedding(nn.Module):
-        def __init__(self, inv_freq):
+    class BasePositionRotaryEmbedding(nn.Module):
+        def __init__(self, inv_freq, scaling_factor=1.0):
             super().__init__()

             self.inv_freq = inv_freq
+            self.scaling_factor = scaling_factor
             self._seq_len_cached = 0
             self._cos_cached = None
             self._sin_cached = None
             self._cos_k_cached = None
             self._sin_k_cached = None

         @classmethod
-        def static(cls, dim, base, device):
+        def static(cls, dim, base, device, scaling_factor=1.0):
             inv_freq = 1.0 / (
                 base
                 ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
             )
-            return cls(inv_freq)
+            return cls(inv_freq, scaling_factor)

         @classmethod
         def load(cls, prefix, weights):
@@ -427,6 +428,8 @@ def _update_cos_sin_cache(self, dtype, device, seqlen):
             ):
                 self._seq_len_cached = seqlen
                 t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+                if self.scaling_factor != 1.0:
+                    t = t / self.scaling_factor
                 # Don't do einsum, it converts fp32 to fp16
                 # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                 freqs = torch.outer(t, self.inv_freq.to(device=t.device))
@@ -454,5 +457,23 @@ def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
             rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False)
             return x

+    class PositionRotaryEmbedding(BasePositionRotaryEmbedding):
+        @classmethod
+        def static(cls, dim, base, device):
+            inv_freq = 1.0 / (
+                base
+                ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
+            )
+            return cls(inv_freq)
+
+    class LinearScalingPositionRotaryEmbedding(BasePositionRotaryEmbedding):
+        @classmethod
+        def static(cls, dim, base, scaling_factor, device):
+            inv_freq = 1.0 / (
+                base
+                ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
+            )
+            return cls(inv_freq, scaling_factor)
+
 except ImportError:
     pass
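
As a quick sanity check on the scaling_factor change in _update_cos_sin_cache, the following self-contained sketch (plain PyTorch, not the repository's classes) computes the rotary cos table with and without a factor of 4 and shows that scaled position 400 receives the same angles as unscaled position 100, which is exactly the compression of position ids that linear RoPE scaling relies on. The base of 10000.0 is only an assumed default for the sketch.

# Self-contained sketch (requires torch only; not the repository code) showing the
# effect of dividing positions by scaling_factor before the outer product, as in
# the _update_cos_sin_cache change above.
import torch

def rotary_cos_sin(seqlen, dim, base=10000.0, scaling_factor=1.0):
    # Inverse frequencies per even head dimension, same formula as static() above.
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    t = torch.arange(seqlen, dtype=torch.float32)
    if scaling_factor != 1.0:
        t = t / scaling_factor  # linear position scaling
    freqs = torch.outer(t, inv_freq)
    return torch.cos(freqs), torch.sin(freqs)

cos_plain, _ = rotary_cos_sin(seqlen=2048, dim=128)
cos_scaled, _ = rotary_cos_sin(seqlen=2048, dim=128, scaling_factor=4.0)
# With factor=4, position 400 gets the same rotary angles as unscaled position 100.
assert torch.allclose(cos_scaled[400], cos_plain[100])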
