Commit 9089596

Fix quantization for Pixtral, copy vision tower tensors to quantized model

1 parent: d37cf7e

File tree: 6 files changed (+97, -73 lines)

exllamav2/conversion/compile.py

Lines changed: 77 additions & 54 deletions
@@ -59,86 +59,106 @@ def get_q_module(job, module):
 @torch.inference_mode()
 def compile_model(job, save_fn, model):

+    cfg = model.config
     out_dict = {}
     current_size = 0
     file_index = 1
     index = 0
     shard_bytes = job["shard_size"] * 1024 ** 2

-    while index < len(model.modules):
+    extra_tensors = []
+    if cfg.arch.mmp_prefix:
+        extra_tensors += [k for k in cfg.tensor_file_map.keys() if k.startswith(cfg.arch.mmp_prefix)]
+    if cfg.arch.vt_prefix:
+        extra_tensors += [k for k in cfg.tensor_file_map.keys() if k.startswith(cfg.arch.vt_prefix)]
+    extra_tensors_size = 0

-        module = model.modules[index]
+    while index < len(model.modules) or len(extra_tensors):

-        if isinstance(module, ExLlamaV2Embedding):
+        if index < len(model.modules):

-            d = get_f_module(job, module); out_dict.update(d); current_size += _dsize(d)
+            module = model.modules[index]

-        if isinstance(module, ExLlamaV2PosEmbedding):
+            if isinstance(module, ExLlamaV2Embedding):

-            d = get_f_module(job, module); out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module); out_dict.update(d); current_size += _dsize(d)

-        if isinstance(module, ExLlamaV2Attention):
+            if isinstance(module, ExLlamaV2PosEmbedding):

-            d = get_f_module(job, module.pre_layernorm)
-            if d: out_dict.update(d); current_size += _dsize(d)
-            d = get_f_module(job, module.post_layernorm)
-            if d: out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.q_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.k_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.v_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.o_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module); out_dict.update(d); current_size += _dsize(d)

-        if isinstance(module, ExLlamaV2MLP):
+            if isinstance(module, ExLlamaV2Attention):

-            has_gate = model.config.arch.lm.mlp_gate
-            d = get_f_module(job, module.pre_layernorm)
-            if d: out_dict.update(d); current_size += _dsize(d)
-            d = get_f_module(job, module.post_layernorm)
-            if d: out_dict.update(d); current_size += _dsize(d)
-            if has_gate: d = get_q_module(job, module.gate_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.up_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.down_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module.pre_layernorm)
+                if d: out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module.post_layernorm)
+                if d: out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.q_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.k_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.v_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.o_proj); out_dict.update(d); current_size += _dsize(d)

-        if isinstance(module, ExLlamaV2MoEMLP):
+            if isinstance(module, ExLlamaV2MLP):

-            d = get_f_module(job, module.post_attention_layernorm); out_dict.update(d); current_size += _dsize(d)
-            d = get_f_module(job, module.gate); out_dict.update(d); current_size += _dsize(d)
-            for i in range(model.config.num_experts):
-                d = get_q_module(job, module.w1[i]); out_dict.update(d); current_size += _dsize(d)
-                d = get_q_module(job, module.w3[i]); out_dict.update(d); current_size += _dsize(d)
-                d = get_q_module(job, module.w2[i]); out_dict.update(d); current_size += _dsize(d)
+                has_gate = model.config.arch.lm.mlp_gate
+                d = get_f_module(job, module.pre_layernorm)
+                if d: out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module.post_layernorm)
+                if d: out_dict.update(d); current_size += _dsize(d)
+                if has_gate: d = get_q_module(job, module.gate_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.up_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.down_proj); out_dict.update(d); current_size += _dsize(d)

-        if isinstance(module, ExLlamaV2ParallelDecoder):
+            if isinstance(module, ExLlamaV2MoEMLP):

-            has_gate = model.config.arch.lm.mlp_gate
-            has_qk_norm = model.config.use_qk_norm
-            d = get_f_module(job, module.input_layernorm); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.attn.q_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.attn.k_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.attn.v_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.attn.o_proj); out_dict.update(d); current_size += _dsize(d)
-            if has_qk_norm:
-                d = get_f_module(job, module.attn.q_norm); out_dict.update(d); current_size += _dsize(d)
-                d = get_f_module(job, module.attn.k_norm); out_dict.update(d); current_size += _dsize(d)
-            if has_gate:
-                d = get_q_module(job, module.mlp.gate_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.mlp.up_proj); out_dict.update(d); current_size += _dsize(d)
-            d = get_q_module(job, module.mlp.down_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module.post_attention_layernorm); out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module.gate); out_dict.update(d); current_size += _dsize(d)
+                for i in range(model.config.num_experts):
+                    d = get_q_module(job, module.w1[i]); out_dict.update(d); current_size += _dsize(d)
+                    d = get_q_module(job, module.w3[i]); out_dict.update(d); current_size += _dsize(d)
+                    d = get_q_module(job, module.w2[i]); out_dict.update(d); current_size += _dsize(d)

-        if isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm):
+            if isinstance(module, ExLlamaV2ParallelDecoder):

-            d = get_f_module(job, module); out_dict.update(d); current_size += _dsize(d)
+                has_gate = model.config.arch.lm.mlp_gate
+                has_qk_norm = model.config.use_qk_norm
+                d = get_f_module(job, module.input_layernorm); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.attn.q_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.attn.k_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.attn.v_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.attn.o_proj); out_dict.update(d); current_size += _dsize(d)
+                if has_qk_norm:
+                    d = get_f_module(job, module.attn.q_norm); out_dict.update(d); current_size += _dsize(d)
+                    d = get_f_module(job, module.attn.k_norm); out_dict.update(d); current_size += _dsize(d)
+                if has_gate:
+                    d = get_q_module(job, module.mlp.gate_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.mlp.up_proj); out_dict.update(d); current_size += _dsize(d)
+                d = get_q_module(job, module.mlp.down_proj); out_dict.update(d); current_size += _dsize(d)

-        if isinstance(module, ExLlamaV2Linear):
+            if isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm):

-            assert module.key == "lm_head"
-            d = get_q_module(job, module); out_dict.update(d); current_size += _dsize(d)
+                d = get_f_module(job, module); out_dict.update(d); current_size += _dsize(d)

-        index += 1
+            if isinstance(module, ExLlamaV2Linear):
+
+                assert module.key == cfg.arch.lm_prefix + "lm_head"
+                d = get_q_module(job, module); out_dict.update(d); current_size += _dsize(d)
+
+            index += 1
+
+        else:
+
+            key = extra_tensors[0]
+            extra_tensors = extra_tensors[1:]
+            file = cfg.tensor_file_map[key]
+            with safe_open(file, framework = "pt") as f:
+                tensor = f.get_tensor(key)
+            out_dict.update({key: tensor})
+            extra_tensors_size += _tsize(tensor)

         # Save shard

-        if current_size > shard_bytes or index == len(model.modules):
+        if current_size > shard_bytes or (index == len(model.modules) and len(extra_tensors) == 0):

             print_stage(job, "Compiling", index, len(model.modules))

@@ -175,7 +195,7 @@ def compile_model(job, save_fn, model):

                 out_dict = dont_save_dict

-            if index == len(model.modules) and len(out_dict) > 0:
+            if index == len(model.modules) and len(extra_tensors) == 0 and len(out_dict) > 0:
                 save_dict = dont_save_dict
                 dont_save_dict = {}
                 continue

@@ -203,6 +223,9 @@ def compile_model(job, save_fn, model):
             filesize = os.path.getsize(final_filename) // (1024 ** 2)
             print(f" -- {final_filename} ({filesize:,} MB)")

+            if extra_tensors_size:
+                print(f" -- Tensors copied (MM components): {extra_tensors_size // (1024 ** 2):,} MB")
+
     # Copy all non-tensor files from the model's directory if compiling a full model

     if job["compile_full"] is not None:

exllamav2/conversion/convert_exl2.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ def save_job():
     sys.exit()

 if job["progress"] == "finished":
-    print(" !! Job is already finished")
+    print(f" !! Job is already finished. Clear the working directory, or run this script with -nr/--no_resume to clear it automatically.")
     sys.exit()

 # Feedback
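
The new message points the user at the -nr/--no_resume switch, which discards the stale job directory and restarts the conversion. A hedged example invocation, assuming the repo-root convert.py entry point and its usual -i/-o directory arguments (the paths are placeholders):

python convert.py -i /mnt/models/pixtral-12b -o /mnt/temp/exl2_job -nr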

exllamav2/conversion/optimize.py

Lines changed: 16 additions & 15 deletions
@@ -8,11 +8,12 @@
 def optimize(job, save_fn, model):

     cfg = model.config
+    km = cfg.arch.lm.keys

     has_gate = cfg.arch.lm.mlp_gate
-    if has_gate: mlp_key_gate = cfg.arch.mlp_key_gate
-    mlp_key_up = cfg.arch.mlp_key_up
-    mlp_key_down = cfg.arch.mlp_key_down
+    if has_gate: mlp_key_gate = km["mlp_gate"]
+    mlp_key_up = km["mlp_up"]
+    mlp_key_down = km["mlp_down"]

     norm_interval = (1.5, 3.5)
     norm_2ndstage = 0.15

@@ -24,19 +25,19 @@ def optimize(job, save_fn, model):
     anneal_stages = 3

     first_q_layer = 0
-    while not model.modules[first_q_layer].key.startswith("model.layers"):
+    while not model.modules[first_q_layer].key.startswith(cfg.arch.lm_prefix + "model.layers"):
         first_q_layer += 1

     # max_step_size = 2
     # first_layer_bias = 4
     # bias_layers = 2
     # bias_iter = 0

-    key = "model.layers.0"
-    key_q = key + ".self_attn.q_proj"
-    key_k = key + ".self_attn.k_proj"
-    key_v = key + ".self_attn.v_proj"
-    key_o = key + ".self_attn.o_proj"
+    key = cfg.arch.lm_prefix + "model.layers.0"
+    key_q = key + km["attn_q"]
+    key_k = key + km["attn_k"]
+    key_v = key + km["attn_v"]
+    key_o = key + km["attn_o"]

     if not cfg.arch.lm.is_moe:
         if has_gate: key_g = key + mlp_key_gate

@@ -84,11 +85,11 @@ def optimize(job, save_fn, model):

     for i in range(num_layers):
         if cfg.arch.lm.parallel_decoder_blocks:
-            m1 = measurement["model.layers." + str(i) + ".parallel_decoder"]["attn"]
-            m2 = measurement["model.layers." + str(i) + ".parallel_decoder"]["mlp"]
+            m1 = measurement[cfg.arch.lm_prefix + "model.layers." + str(i) + ".parallel_decoder"]["attn"]
+            m2 = measurement[cfg.arch.lm_prefix + "model.layers." + str(i) + ".parallel_decoder"]["mlp"]
         else:
-            m1 = measurement["model.layers." + str(i) + ".self_attn"]
-            m2 = measurement["model.layers." + str(i) + "." + mlp_mode]
+            m1 = measurement[cfg.arch.lm_prefix + "model.layers." + str(i) + ".self_attn"]
+            m2 = measurement[cfg.arch.lm_prefix + "model.layers." + str(i) + "." + mlp_mode]
         for m in [m1, m2]:
             slot = []
             param = []

@@ -154,8 +155,8 @@ def optimize(job, save_fn, model):
     job["strategy"] = {}
     for layer_ in range(num_layers):

-        k1 = "model.layers." + str(layer_) + ".self_attn"
-        k2 = "model.layers." + str(layer_) + "." + mlp_mode
+        k1 = cfg.arch.lm_prefix + "model.layers." + str(layer_) + ".self_attn"
+        k2 = cfg.arch.lm_prefix + "model.layers." + str(layer_) + "." + mlp_mode
         p1 = params[layer_ * 2][solution_idx[layer_ * 2]]
         p2 = params[layer_ * 2 + 1][solution_idx[layer_ * 2 + 1]]
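
To make the key changes above concrete, here is a small illustration of how a per-layer tensor key is assembled once the language-model prefix is factored in. The prefix value and the keys dictionary are assumptions about a Pixtral-style checkpoint layout rather than values read from architecture.py; for plain text-only models the prefix would simply be empty.

# Illustration only - hypothetical values, not the actual contents of cfg.arch:
lm_prefix = "language_model."             # assumed cfg.arch.lm_prefix for a multimodal model
km = {"attn_q": ".self_attn.q_proj"}      # assumed subset of cfg.arch.lm.keys
key_q = lm_prefix + "model.layers.0" + km["attn_q"]
print(key_q)                              # language_model.model.layers.0.self_attn.q_proj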

exllamav2/conversion/quantize.py

Lines changed: 1 addition & 1 deletion
@@ -326,7 +326,7 @@ def quant(job, save_fn, model):

         elif isinstance(module, ExLlamaV2Linear):
             mode = "linear"
-            assert module.key == "lm_head"
+            assert module.key == model.config.arch.lm_prefix + "lm_head"
             quantizers["lm_head"] = AdaptiveGPTQ(module.linear)

         elif isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm):

exllamav2/mlp.py

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ def numel(self) -> int:
         numel = self.up_proj.numel() + \
                 self.down_proj.numel()

-        if self.archparams.arch.mlp_gate:
+        if self.archparams.mlp_gate:
             numel += self.gate_proj.numel()

         if self.pre_layernorm is not None:

experimental/multimodal_pixtral_hf.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
 #
 # https://huggingface.co/mistral-community/pixtral-12b/

-model_directory = "/mnt/str/models/pixtral-12b"
+model_directory = "/mnt/str/models/pixtral-12b-exl2/5.0bpw"
 config = ExLlamaV2Config(model_directory)
 config.max_seq_len = 16384  # default is 1M
