
Commit ebfefc4

Support Cohere2 architecture
1 parent d815f5f commit ebfefc4

4 files changed: +49 -9 lines changed

exllamav2/architecture.py

Lines changed: 22 additions & 0 deletions
@@ -515,6 +515,28 @@ class Params:
             self.lm.parallel_decoder_blocks = True
             self.lm.requires_bos = True

+        # Cohere 2
+
+        if arch_string == "Cohere2ForCausalLM":
+            arch_recognized = True
+            self.lm.layer_keys += \
+                layer_keys_cohere_norms + \
+                layer_keys_llama_attn + \
+                layer_keys_llama_mlp
+            self.lm.expect_keys += \
+                expect_keys_gemma
+            self.lm.keys.update({
+                "norm_eps": "layer_norm_eps",
+                "lm_head": "model.embed_tokens",
+                "norm_1": ".input_layernorm",
+                "norm_2": None,
+            })
+            self.lm.norm = "layernorm"
+            self.lm.rope_style = RopeStyle.GPTJ
+            self.lm.parallel_decoder_blocks = True
+            self.lm.requires_bos = True
+            self.lm.alternating_swa = True
+
         # DBRX

         if arch_string == "DbrxForCausalLM":
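
The new entry reuses the existing Cohere norm / Llama attention / Llama MLP key layouts and maps "lm_head" to "model.embed_tokens": the checkpoint carries no separate output projection, and logits are computed against the tied input embedding matrix. A minimal sketch of that tied-embedding idea, with placeholder dimensions and names that are not exllamav2 internals:

# Illustrative sketch of tied input/output embeddings ("lm_head" -> "model.embed_tokens").
# Dimensions and names are placeholders, not exllamav2 API.
import torch

vocab_size, hidden_size = 256, 64
embed = torch.nn.Embedding(vocab_size, hidden_size)

def logits_from_hidden(h: torch.Tensor) -> torch.Tensor:
    # No separate lm_head tensor: project hidden states against the embedding matrix itself.
    return h @ embed.weight.T

h = torch.randn(1, 8, hidden_size)
print(logits_from_hidden(h).shape)   # torch.Size([1, 8, 256])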

exllamav2/config.py

Lines changed: 2 additions & 0 deletions
@@ -115,6 +115,7 @@ class ExLlamaV2Config:
     final_logit_softcapping: float | None
     attn_logit_softcapping: float | None
     sliding_window: int
+    sliding_window_pattern: int
     norm_head: int | None
     l3_rope_factor: float | None
     l3_rope_low_freq_factor: float | None
@@ -347,6 +348,7 @@ def prepare(self, no_tensors: bool = False):
         self.original_max_seq_len = self.max_seq_len

         self.sliding_window = read(read_config, int, ["sliding_window", "sliding_window_size"], 0, opt_subkey = "text_config")
+        self.sliding_window_pattern = read(read_config, int, ["sliding_window_pattern"], 1)

         rs = read(read_config, dict, "rope_scaling", None)
         if rs:
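
The config loader now also picks up sliding_window_pattern from the model's config.json, defaulting to 1 when the key is absent. A small sketch of what is being read; the values are examples in the Cohere2 style, not pinned to any particular checkpoint, and dict.get stands in for exllamav2's read() helper:

# Hypothetical config fragment and the values the two read() calls above would yield.
read_config = {
    "architectures": ["Cohere2ForCausalLM"],
    "sliding_window": 4096,          # window size for the local-attention layers
    "sliding_window_pattern": 4,     # every 4th layer falls back to global attention
}

sliding_window = read_config.get("sliding_window", 0)                  # -> 4096
sliding_window_pattern = read_config.get("sliding_window_pattern", 1)  # -> 4 (default 1)
print(sliding_window, sliding_window_pattern)

Note that the default of 1 makes (layer_idx + 1) % sliding_window_pattern equal 0 for every layer, so an alternating-SWA architecture whose config omits the pattern ends up with global attention throughout (see the model.py change below).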

exllamav2/model.py

Lines changed: 9 additions & 7 deletions
@@ -106,16 +106,18 @@ def __init__(
         for layer_idx in range(cfg.num_hidden_layers):

             layer_key = cfg.arch.lm_prefix + f"model.layers.{layer_idx}"
+
+            if cfg.arch.lm.alternating_swa:
+                swa = cfg.sliding_window if (layer_idx + 1) % cfg.sliding_window_pattern != 0 else 0
+            elif cfg.arch.lm.swa:
+                swa = cfg.sliding_window
+            else:
+                swa = 0
+
             if cfg.arch.lm.parallel_decoder_blocks:
-                pd = ExLlamaV2ParallelDecoder(self, layer_key, layer_idx)
+                pd = ExLlamaV2ParallelDecoder(self, layer_key, layer_idx, sliding_window = swa)
                 self.modules += [pd]
             else:
-                if cfg.arch.lm.alternating_swa:
-                    swa = cfg.sliding_window if not bool(layer_idx % 2) else 0
-                elif cfg.arch.lm.swa:
-                    swa = cfg.sliding_window
-                else:
-                    swa = 0
                 attn = ExLlamaV2Attention(self, layer_key, layer_idx, sliding_window = swa)
                 if cfg.arch.lm.is_moe: mlp = ExLlamaV2MoEMLP(self, layer_key, layer_idx)
                 else: mlp = ExLlamaV2MLP(self, layer_key, layer_idx)
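
The sliding-window decision now happens once per layer, before the parallel/serial branch, and replaces the old even/odd alternation: a layer uses the sliding window unless (layer_idx + 1) is a multiple of sliding_window_pattern, so with a pattern of 4 the 0-based layers 3, 7, 11, ... run with full attention. A standalone check of the rule (illustrative values, not exllamav2 code):

# New rule: every pattern-th layer gets global attention (swa == 0), the rest use the window.
sliding_window, pattern = 4096, 4

new_rule = [sliding_window if (layer_idx + 1) % pattern != 0 else 0 for layer_idx in range(8)]
print(new_rule)   # [4096, 4096, 4096, 0, 4096, 4096, 4096, 0]

# Old rule (removed above) only alternated on even/odd layer index:
old_rule = [sliding_window if not bool(layer_idx % 2) else 0 for layer_idx in range(8)]
print(old_rule)   # [4096, 0, 4096, 0, 4096, 0, 4096, 0]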

exllamav2/parallel_decoder.py

Lines changed: 16 additions & 2 deletions
@@ -29,6 +29,7 @@ def __init__(
         model: ExLlamaV2,
         key: str,
         layer_idx: int,
+        sliding_window: int = 0,
         archparams = None
     ):
         super().__init__(model, key, archparams)
@@ -42,8 +43,21 @@ def __init__(
         elif self.archparams.norm == "rmsnorm":
             self.input_layernorm = ExLlamaV2RMSNorm(model, key + self.archparams.keys["norm_1"])

-        self.attn = ExLlamaV2Attention(model, key, layer_idx, has_norm = False, has_residual = False)
-        self.mlp = ExLlamaV2MLP(model, key, layer_idx, has_norm = False, has_residual = False)
+        self.attn = ExLlamaV2Attention(
+            model,
+            key,
+            layer_idx,
+            has_norm = False,
+            has_residual = False,
+            sliding_window = sliding_window
+        )
+        self.mlp = ExLlamaV2MLP(
+            model,
+            key,
+            layer_idx,
+            has_norm = False,
+            has_residual = False
+        )

         self.submodules = self.attn.submodules + self.mlp.submodules
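
ExLlamaV2ParallelDecoder now accepts a sliding_window argument and forwards it only to its attention sub-module; the MLP construction is unchanged. For context, a parallel decoder block feeds one shared pre-norm into both attention and MLP and sums their outputs into the residual, which is why the per-layer window for Cohere2 has to be threaded through this wrapper rather than attached to the attention module directly. A plain-PyTorch sketch of that dataflow, with stand-in modules rather than exllamav2's quantized ones:

# Minimal parallel-block sketch: one shared norm, attention and MLP in parallel,
# both added to the residual. attn_fn / mlp_fn are placeholders, not exllamav2 modules.
import torch

def parallel_block(x, norm, attn_fn, mlp_fn):
    h = norm(x)                        # single input_layernorm shared by both branches
    return x + attn_fn(h) + mlp_fn(h)  # residual plus both branch outputs

norm = torch.nn.LayerNorm(64)
attn_fn = torch.nn.Linear(64, 64)      # stand-in for (sliding-window) attention
mlp_fn = torch.nn.Linear(64, 64)       # stand-in for the MLP
x = torch.randn(1, 10, 64)
print(parallel_block(x, norm, attn_fn, mlp_fn).shape)   # torch.Size([1, 10, 64])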
