
Commit 3ffcc74

Support Index architecture
1 parent c91893a commit 3ffcc74

4 files changed: +39 −2 lines

exllamav2/architecture.py

Lines changed: 21 additions & 0 deletions
@@ -569,6 +569,27 @@ def __init__(self, arch_string, read_config):
             self.rope_style = RopeStyle.NEOX
             self.fused_qkv_altpack = True
 
+        # Index
+
+        if arch_string == "IndexForCausalLM":
+            arch_recognized = True
+            self.layer_keys += \
+                layer_keys_llama_norms + \
+                layer_keys_llama_attn + \
+                layer_keys_llama_mlp
+            self.expect_keys += \
+                expect_keys_llama
+            self.norm_eps_key = "rms_norm_eps"
+            self.mlp_key_gate = ".mlp.gate_proj"
+            self.mlp_key_up = ".mlp.up_proj"
+            self.mlp_key_down = ".mlp.down_proj"
+            self.lm_head_key = "lm_head"
+            self.norm_key_1 = ".input_layernorm"
+            self.norm_key_2 = ".post_attention_layernorm"
+            self.mlp_act_func = "silu"
+            self.norm = "rmsnorm"
+            self.rope_style = RopeStyle.NEOX
+
         # Llama (default + fallback)
 
         if arch_string != "LlamaForCausalLM" and not arch_recognized:

exllamav2/config.py

Lines changed: 5 additions & 0 deletions
@@ -104,6 +104,7 @@ class ExLlamaV2Config:
     final_logit_softcapping: float | None
     attn_logit_softcapping: float | None
     sliding_window: int
+    norm_head: int | None
 
     checkpoint_fused_mlp: bool
 
@@ -251,6 +252,10 @@ def prepare(self, no_tensors: bool = False):
         self.attn_logit_softcapping = read(read_config, float, "attn_logit_softcapping", None)
         self.final_logit_softcapping = read(read_config, float, "final_logit_softcapping", None)
 
+        # Normalize weights in head layer
+
+        self.norm_head = read(read_config, int, "norm_head", None)
+
         # Positional embeddings
 
         self.rotary_embedding_base = read(read_config, float, ["rope_theta", "attn_config->rope_theta"], 10000.0)
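Note (illustrative, not from the commit): judging by the call above, norm_head is read straight from the parsed config.json as an int and defaults to None when the key is absent. A standalone sketch of that behaviour with made-up config values:

# Standalone sketch; the `read` helper's exact behaviour is assumed from the
# call site above: fetch "norm_head" from config.json, defaulting to None.
import json

read_config = json.loads('{"architectures": ["IndexForCausalLM"], "norm_head": 1}')  # made-up example

norm_head = read_config.get("norm_head", None)
if norm_head is not None:
    norm_head = int(norm_head)

print(norm_head)  # 1 -> the head weights get normalized at load time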

exllamav2/linear.py

Lines changed: 11 additions & 1 deletion
@@ -54,7 +54,8 @@ def __init__(self,
                  f_beg: int = None,
                  f_end: int = None,
                  is_sub_module: bool = True,
-                 altpack_qkv: bool = False):
+                 altpack_qkv: bool = False,
+                 normalize_unq: bool = False):
         super().__init__(model, key)
 
         self.is_sub_module = is_sub_module
@@ -89,6 +90,7 @@ def __init__(self,
         self.altpack_qkv = altpack_qkv
 
         self.assumed_footprint = in_features * (out_features + self.padding) * 2 + 128
+        self.normalize_unq = normalize_unq
 
 
     @torch.inference_mode
@@ -125,6 +127,8 @@ def load(self,
 
         elif isinstance(w, nn.Parameter):
             assert not self.has_bias, self.key + " has no bias tensor but bias is expected"
+            if self.normalize_unq:
+                w = self.normalize(w)
             if self.padding > 0: w = nn.Parameter(F.pad(w.data, (0, 0, 0, self.padding)).contiguous())
             if not self.model.config.load_in_q4 or not ".layers." in self.key:
                 self.linear = nn.Linear(self.in_features, self.out_features, self.has_bias, device = "meta", dtype = torch.float16)
@@ -138,6 +142,8 @@ def load(self,
 
         elif isinstance(w, tuple):
             assert self.has_bias, self.key + " has bias tensor but bias is not expected"
+            if self.normalize_unq:
+                w = self.normalize(w[0]), w[1]
             ww = w[0]
             wb = w[1]
             if self.padding > 0:
@@ -154,6 +160,10 @@ def load(self,
                 self.fp16_bias = wb
 
 
+    def normalize(self, w: torch.Tensor):
+        return nn.functional.normalize(w)
+
+
     def matrix_shape(self):
 
         return self.in_features, self.out_features
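Note (not part of the diff): the new normalize() hook forwards to torch.nn.functional.normalize, whose defaults are p=2 and dim=1, so for a 2-D [vocab_size, hidden_size] head weight every row is rescaled to unit L2 norm. A self-contained toy check:

# Toy demonstration of what normalize() does to an unquantized head weight.
# F.normalize defaults to p=2, dim=1: each row ends up with unit L2 norm.
import torch
import torch.nn.functional as F

w = torch.randn(8, 16)         # stand-in for a [vocab_size, hidden_size] lm_head weight
w_normed = F.normalize(w)      # same call as ExLlamaV2Linear.normalize above

print(w_normed.norm(dim = 1))  # every entry is ~1.0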

exllamav2/model.py

Lines changed: 2 additions & 1 deletion
@@ -250,7 +250,8 @@ def __init__(self, config: ExLlamaV2Config, lazy_load = False):
                                False,
                                max_out_len = self.config.max_output_len,
                                prescale = self.config.logit_scale,
-                               is_sub_module = False)
+                               is_sub_module = False,
+                               normalize_unq = bool(self.config.norm_head))
         if self.config.arch.lm_head_key != "lm_head":
             head.alt_key = self.config.arch.lm_head_key
         self.modules += [head]
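Note (not part of the diff): normalize_unq is gated through bool(self.config.norm_head), and norm_head defaults to None, so checkpoints whose config.json has no "norm_head" entry keep the old behaviour. A quick check of that coercion:

# bool() coercion used above: missing (None) or zero leaves normalization off,
# any non-zero value turns it on.
for norm_head in (None, 0, 1):
    print(repr(norm_head), "->", bool(norm_head))
# None -> False, 0 -> False, 1 -> True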
