Commit 9244003

Add support for Mistral 3.1 VLM

Parent: 68f7461

6 files changed: +91 -13 lines

examples/multimodal.py

Lines changed: 12 additions & 7 deletions

@@ -26,6 +26,8 @@
 # Pixtral:
 # https://huggingface.co/mistral-community/pixtral-12b/
 # https://huggingface.co/turboderp/pixtral-12b-exl2
+# Mistral-Small 3.1:
+# https://huggingface.co/prince-canuma/Mistral-Small-3.1-24B-Instruct-2503
 # Qwen2-VL:
 # https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
 # https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2
@@ -34,18 +36,21 @@
 # https://huggingface.co/turboderp/gemma-3-27b-it-exl2
 
 # mode = "pixtral"
+mode = "mistral3"
 # mode = "qwen2"
-mode = "gemma3"
+# mode = "gemma3"
 
 streaming = True
 greedy = True
 
 if mode == "pixtral":
     model_directory = "/mnt/str/models/pixtral-12b-exl2/6.0bpw"
 elif mode == "qwen2":
-    model_directory = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/6.0bpw"
+    model_directory = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/5.0bpw"
 elif mode == "gemma3":
-    model_directory = "/mnt/str/models/gemma3-27b-it-exl2/5.0bpw"
+    model_directory = "/mnt/str/models/gemma3-12b-it-exl2/6.0bpw"
+elif mode == "mistral3":
+    model_directory = "/mnt/str/models/mistral-small-3.1-24b-instruct/exl2/4.5bpw"
 
 images = [
     # {"file": "media/test_image_1.jpg"},
@@ -62,7 +67,7 @@
 # Initialize model
 
 config = ExLlamaV2Config(model_directory)
-config.max_seq_len = 16384  # Pixtral default is 1M
+config.max_seq_len = 8192  # Pixtral default is 1M
 
 # Load vision model and multimodal projector and initialize preprocessor
 
@@ -72,8 +77,8 @@
 # Load EXL2 model
 
 model = ExLlamaV2(config)
-cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
-model.load_autosplit(cache, progress = True)
+cache = ExLlamaV2Cache(model, max_seq_len = 8192, lazy = True)
+model.load_autosplit(progress = True, cache = cache)
 tokenizer = ExLlamaV2Tokenizer(config)
 
 # Create generator
@@ -121,7 +126,7 @@ def get_image(file = None, url = None):
 # Image token IDs are assigned sequentially, however, so two ExLlamaV2Embedding objects created from the same
 # source image will not be recognized as the same image for purposes of prompt caching etc.
 
-if mode == "pixtral":
+if mode in ["pixtral", "mistral3"]:
     prompt = (
         "[INST]" +
         placeholders +
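Taken on its own, the loading path the example exercises for the new mistral3 mode is the standard EXL2 flow shown in the hunks above. A minimal sketch for reference (the model directory is a placeholder, and the vision tower / multimodal projector loading handled elsewhere in the example is omitted):

```python
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer

# Placeholder path to a local EXL2 quant of Mistral-Small 3.1
model_directory = "/path/to/mistral-small-3.1-24b-instruct-exl2/4.5bpw"

config = ExLlamaV2Config(model_directory)
config.max_seq_len = 8192  # cap the context; the model's default is much larger

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, max_seq_len = 8192, lazy = True)
model.load_autosplit(progress = True, cache = cache)  # split layers across available GPUs
tokenizer = ExLlamaV2Tokenizer(config)
```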

exllamav2/architecture.py

Lines changed: 48 additions & 0 deletions

@@ -241,6 +241,8 @@ class Params:
         rope_freq_half: bool = False
         learned_emb: bool = False
         output_norm: bool = False
+        mlp_merger: bool = False
+        mlp_patch_merger: bool = False
 
         # Component models
         self.lm_prefix = ""
@@ -340,6 +342,52 @@ class Params:
             self.mmp.mlp_act_func = "gelu"
             self.mmp.mlp_bias = bool(read_config.get("multimodal_projector_bias", True))
 
+        # Mistral 3 multimodal
+
+        if (
+            arch_string == "Mistral3ForConditionalGeneration" and
+            "vision_config" in read_config and
+            read_config["vision_config"].get("model_type") == "pixtral"
+        ):
+            arch_recognized = True
+            self.lm_prefix = "language_model."
+            self.lm.layer_keys += \
+                layer_keys_llama_norms + \
+                layer_keys_llama_attn + \
+                layer_keys_llama_mlp
+            self.lm.expect_keys += \
+                expect_keys_llama
+
+            self.vt_prefix = "vision_tower."
+            self.vt.keys.update({
+                "attn_q": ".attention.q_proj",
+                "attn_k": ".attention.k_proj",
+                "attn_v": ".attention.v_proj",
+                "attn_o": ".attention.o_proj",
+                "mlp_gate": ".feed_forward.gate_proj",
+                "mlp_up": ".feed_forward.up_proj",
+                "mlp_down": ".feed_forward.down_proj",
+                "norm_1": ".attention_norm",
+                "norm_2": ".ffn_norm",
+                "layers": "transformer.layers",
+                "ln_pre": "ln_pre",
+            })
+            self.vt.mlp_merger = True
+            self.vt.mlp_patch_merger = True
+
+            self.mmp_prefix = "multi_modal_projector."
+            self.mmp.keys.update({
+                "norm_2": "norm",
+                "mlp_gate": None,
+                "mlp_up": "linear_1",
+                "mlp_down": "linear_2",
+                "patch_merger": "patch_merger.merging_layer",
+            })
+            self.mmp.mlp_patch_merger = True
+            self.mmp.mlp_gate = False
+            self.mmp.mlp_act_func = "gelu"
+            self.mmp.mlp_bias = bool(read_config.get("multimodal_projector_bias", True))
+
         # Yi
 
         if arch_string == "YiForCausalLM":
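For orientation, the new branch only fires for checkpoints whose config.json declares the Mistral3ForConditionalGeneration architecture with a Pixtral-type vision_config. A small standalone sketch of the same check (the file path is a placeholder; this mirrors the condition above and is not an exllamav2 API):

```python
import json

# Placeholder path to a downloaded Mistral-Small 3.1 checkpoint
with open("/path/to/checkpoint/config.json") as f:
    cfg = json.load(f)

arch_string = cfg["architectures"][0]
is_mistral3_vlm = (
    arch_string == "Mistral3ForConditionalGeneration" and
    "vision_config" in cfg and
    cfg["vision_config"].get("model_type") == "pixtral"
)
print("recognized as Mistral 3 multimodal:", is_mistral3_vlm)
```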

exllamav2/config.py

Lines changed: 5 additions & 3 deletions

@@ -552,18 +552,20 @@ def check_keys(archparams, prefix):
         self.vision_merger_intermediate_size = self.vision_intermediate_size
 
         image_processor_type = read(read_prep_config, str, ["image_processor_type"], no_default)
-        assert image_processor_type == "PixtralImageProcessor", \
+        assert image_processor_type == "PixtralImageProcessor" or image_processor_type == "PixtralImageProcessorFast", \
            f"Wrong image processor type: {image_processor_type}"
         self.vision_image_mean = read(read_prep_config, list, ["image_mean"], no_default)
         self.vision_image_std = read(read_prep_config, list, ["image_std"], no_default)
-        self.vision_patch_size = read(read_prep_config, dict, ["patch_size"], no_default)
+        self.vision_patch_size = read(read_prep_config, object, ["patch_size"], no_default)
+        if isinstance(self.vision_patch_size, int):
+            self.vision_patch_size = {"width": self.vision_patch_size, "height": self.vision_patch_size}
         assert all(self.vision_patch_size.get(x) == patch_size for x in ["width", "height"]), \
             "Patch size inconsistency between config.json and preprocessor_config.json"
         self.vision_resample = read(read_prep_config, int, ["resample"], no_default)
         self.vision_rescale_factor = read(read_prep_config, float, ["rescale_factor"], no_default)
         self.vision_size = read(read_prep_config, dict, ["size"], no_default)
         self.vision_num_channels = 3
-        self.vision_spatial_merge_size = 1
+        self.vision_spatial_merge_size = read(read_config, int, ["spatial_merge_size"], 1)
         self.vision_max_size = 16384
         self.vision_window_size = None

exllamav2/mlp.py

Lines changed: 22 additions & 3 deletions

@@ -115,6 +115,11 @@ def __init__(
         else:
             self.gate_proj = None
 
+        if merge and ap.mlp_patch_merger:
+            self.patch_merger_proj = ExLlamaV2Linear(model, key + km["patch_merger"], in_features * merge**2, in_features, ap.mlp_bias)
+            self.submodules += [self.patch_merger_proj]
+        else:
+            self.patch_merger_proj = None
 
     def numel(self) -> int:
 
@@ -158,6 +163,9 @@ def load(
         if self.gate_proj is not None: self.gate_proj.load(device_context = device_context, output_map = down_map)
         self.up_proj.load(device_context = device_context, output_map = down_map)
 
+        if self.patch_merger_proj is not None:
+            self.patch_merger_proj.load()
+
         if self.up_proj.is_quant():
             assert self.gate_proj is None or self.gate_proj.is_quant()
             assert self.up_proj.is_quant(), "Partially quantized MLP layer"
@@ -302,6 +310,8 @@ def set_device_idx(self, idx: int | None):
         if self.gate_proj is not None: self.gate_proj.set_device_idx(idx)
         self.up_proj.set_device_idx(idx)
         self.down_proj.set_device_idx(idx)
+        if self.patch_merger_proj is not None:
+            self.patch_merger_proj.set_device_idx(idx)
 
 
     # @profile
@@ -458,9 +468,18 @@ def forward_torch(
             if self.pre_layernorm else hidden_states
 
         if self.merge:
-            bd = post_norm.shape[:-2]
-            l, d = post_norm.shape[-2:]
-            post_norm = post_norm.view(*bd, l // self.merge, d * self.merge)
+            if self.archparams.mlp_patch_merger:
+                bsz = hidden_states.shape[0]
+                assert bsz == 1
+                (h, w), d = kwargs["patch_size"], hidden_states.shape[-1]
+                image_grid = post_norm.view(h, w, d).permute(2, 0, 1).unsqueeze(0)
+                grid = F.unfold(image_grid, kernel_size = int(self.merge ** 0.5), stride = int(self.merge ** 0.5))
+                grid = grid.view(bsz, d * self.merge, -1).transpose(1, 2)
+                post_norm = self.patch_merger_proj.forward(grid)
+            else:
+                bd = post_norm.shape[:-2]
+                l, d = post_norm.shape[-2:]
+                post_norm = post_norm.view(*bd, l // self.merge, d * self.merge)
 
         if self.gate_proj is not None:
             gate = self.gate_proj.forward(post_norm, loras = loras)
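The new patch-merger branch in forward_torch rebuilds the flat sequence of patch embeddings into its 2-D grid, gathers each spatial neighbourhood into one row with F.unfold, and projects it back down with the merging layer. A toy, self-contained rerun of that reshaping (made-up dimensions; a random nn.Linear stands in for the trained patch_merger.merging_layer):

```python
import torch
import torch.nn.functional as F

h, w, d = 6, 8, 16          # toy patch grid and hidden size (illustrative numbers)
merge = 4                   # number of patches merged per token, i.e. a 2x2 neighbourhood
post_norm = torch.randn(h * w, d)              # flat sequence of patch embeddings
merging_layer = torch.nn.Linear(d * merge, d)  # stand-in for the trained merging layer

# Rebuild the 2-D grid, then gather each 2x2 block of patches into one row
image_grid = post_norm.view(h, w, d).permute(2, 0, 1).unsqueeze(0)  # (1, d, h, w)
grid = F.unfold(image_grid, kernel_size = 2, stride = 2)            # (1, d * 4, h*w / 4)
grid = grid.view(1, d * merge, -1).transpose(1, 2)                  # (1, h*w / 4, d * 4)
merged = merging_layer(grid)                                        # (1, h*w / 4, d)

print(post_norm.shape, "->", merged.shape)  # torch.Size([48, 16]) -> torch.Size([1, 12, 16])
```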

exllamav2/vlm/processor/pixtral.py

Lines changed: 3 additions & 0 deletions

@@ -57,6 +57,9 @@ def postprocess(
     Insert [IMG_BREAK] and [IMG_END] tokens in image feature embeddings
     """
 
+    features_x //= model.config.vision_spatial_merge_size
+    features_y //= model.config.vision_spatial_merge_size
+
     assert embeddings.shape[0] == features_y * features_x, \
         "Invalid shape for embeddings"
 
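Per the Pixtral image-token layout, postprocess lays the image features out row by row, with [IMG_BREAK] after each row and [IMG_END] at the end, so the row and column counts have to be expressed in merged tokens rather than raw patches. A quick illustration with made-up numbers (16-pixel patches, spatial_merge_size 2, a 512x256 input):

```python
patch = 16
spatial_merge_size = 2

features_x, features_y = 512 // patch, 256 // patch      # 32 x 16 raw patch features
features_x //= spatial_merge_size                        # 16 merged tokens per row
features_y //= spatial_merge_size                        # 8 rows
print(features_x, features_y, features_x * features_y)   # 16 8 128 -> 128 embeddings expected
```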

exllamav2/vlm/vision_tower.py

Lines changed: 1 addition & 0 deletions

@@ -309,6 +309,7 @@ def process(
             attn_params = attn_params,
             **kwargs | ({
                 "alt_rope_embedding": (cos, sin),
+                "patch_size": (p_height, p_width),
             } if cos is not None else {})
         )
