
Commit cce6f95

Initial support for Qwen2.5-VL
1 parent d0413b0 commit cce6f95

8 files changed: +222 -48 lines changed

exllamav2/architecture.py

Lines changed: 33 additions & 16 deletions
@@ -356,7 +356,7 @@ class Params:
 
         # Qwen2-VL (2, 2.5)
 
-        if arch_string == "Qwen2VLForConditionalGeneration":
+        if arch_string in ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"]:
             arch_recognized = True
             self.lm.layer_keys += \
                 layer_keys_llama_norms + \
@@ -368,27 +368,44 @@ class Params:
             self.lm.mrope = True
             self.lm.rope_freq_half = True
 
-            read_config["vision_config"].update({"model_type": "qwen2"})
             self.vt_prefix = "visual."
-            self.vt.keys.update({
-                "fused_qkv": ".attn.qkv",
-                "attn_o": ".attn.proj",
-                "mlp_gate": None,
-                "mlp_up": ".mlp.fc1",
-                "mlp_down": ".mlp.fc2",
-                "norm_1": ".norm1",
-                "norm_2": ".norm2",
-                "layers": "blocks",
-                "patch_conv": "patch_embed.proj",
-            })
-            self.vt.mlp_gate = False
+            if arch_string == "Qwen2VLForConditionalGeneration":
+                read_config["vision_config"].update({"model_type": "qwen2"})
+                self.vt.keys.update({
+                    "fused_qkv": ".attn.qkv",
+                    "attn_o": ".attn.proj",
+                    "mlp_gate": None,
+                    "mlp_up": ".mlp.fc1",
+                    "mlp_down": ".mlp.fc2",
+                    "norm_1": ".norm1",
+                    "norm_2": ".norm2",
+                    "layers": "blocks",
+                    "patch_conv": "patch_embed.proj",
+                })
+                self.vt.mlp_gate = False
+                self.vt.mlp_act_func = "quickgelu"
+                self.vt.norm = "layernorm"
+            elif arch_string == "Qwen2_5_VLForConditionalGeneration":
+                read_config["vision_config"].update({"model_type": "qwen2.5"})
+                self.vt.keys.update({
+                    "fused_qkv": ".attn.qkv",
+                    "attn_o": ".attn.proj",
+                    "mlp_gate": ".mlp.gate_proj",
+                    "mlp_up": ".mlp.up_proj",
+                    "mlp_down": ".mlp.down_proj",
+                    "norm_1": ".norm1",
+                    "norm_2": ".norm2",
+                    "layers": "blocks",
+                    "patch_conv": "patch_embed.proj",
+                })
+                self.vt.mlp_gate = True
+                self.vt.mlp_act_func = "silu"
+                self.vt.norm = "rmsnorm"
             self.vt.mlp_bias = True
             self.vt.attention_bias_qkv = True
             self.vt.attention_bias_o = True
             self.vt.vision_input_norm = False
             self.vt.vision_conv3d = True
-            self.vt.mlp_act_func = "quickgelu"
-            self.vt.norm = "layernorm"
 
             self.mmp_prefix = "visual.merger."
             self.mmp.keys.update({
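
The two key maps above encode the main vision-tower difference between the generations: Qwen2-VL uses a plain two-layer MLP (fc1/fc2) with QuickGELU and LayerNorm, while Qwen2.5-VL uses a gated SiLU MLP (gate_proj/up_proj/down_proj) with RMSNorm, hence the different mlp_gate, mlp_act_func and norm settings. A rough standalone sketch of the two block shapes, for illustration only (the module names below are not exllamav2 classes):

import torch
import torch.nn as nn

class QuickGeluMLP(nn.Module):
    # Qwen2-VL style vision MLP: fc1 -> QuickGELU -> fc2
    def __init__(self, dim: int, hidden: int):
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden, bias = True)
        self.fc2 = nn.Linear(hidden, dim, bias = True)
    def forward(self, x):
        h = self.fc1(x)
        return self.fc2(h * torch.sigmoid(1.702 * h))  # QuickGELU approximation

class GatedSiluMLP(nn.Module):
    # Qwen2.5-VL style vision MLP: silu(gate_proj(x)) * up_proj(x) -> down_proj
    def __init__(self, dim: int, hidden: int):
        super().__init__()
        self.gate_proj = nn.Linear(dim, hidden, bias = True)
        self.up_proj = nn.Linear(dim, hidden, bias = True)
        self.down_proj = nn.Linear(hidden, dim, bias = True)
    def forward(self, x):
        return self.down_proj(nn.functional.silu(self.gate_proj(x)) * self.up_proj(x))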

exllamav2/attn.py

Lines changed: 32 additions & 12 deletions
@@ -41,11 +41,11 @@
         print(" ## Warning: Flash Attention is installed but unsupported GPUs were detected.")
 
     if [2, 2, 1] <= flash_attn_ver < [2, 5, 7]:
-        from flash_attn import flash_attn_func
+        from flash_attn import flash_attn_func, flash_attn_varlen_func
         has_flash_attn = True
 
     if [2, 5, 7] <= flash_attn_ver:
-        from flash_attn import flash_attn_func, flash_attn_with_kvcache
+        from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache
         # import flash_attn_2_cuda as flash_attn_cuda
 
         signature = list(inspect.signature(flash_attn_func).parameters)
@@ -882,7 +882,9 @@ def _attn_torch(self, batch_size, q_len, q_states, k_states, v_states, attn_para
            k_states = k_states[:, :, -self.sliding_window:, :]
            v_states = v_states[:, :, -self.sliding_window:, :]
 
-        if attn_params.is_causal():
+        if self.layer_idx in attn_params.block_diag_layers:
+            attn_mask_lr = attn_params.get_block_diag_mask(q_states.device)
+        elif attn_params.is_causal():
             attn_mask_lr = causal_lower_right(q_len, k_states.shape[2])
         else:
             attn_mask_lr = attn_params.get_attn_mask(q_states.device)
@@ -904,7 +906,9 @@ def _attn_torch(self, batch_size, q_len, q_states, k_states, v_states, attn_para
            attn_weights = torch.matmul(q_states, k_states)
 
            attn_weights *= self.scaling
-           if causal:
+           if self.layer_idx in attn_params.block_diag_layers:
+               attn_mask = attn_params.get_block_diag_mask(attn_weights.device)
+           elif causal:
                attn_mask = attn_params.get_attn_mask(attn_weights.device)
 
            if cfg.attn_logit_softcapping:
@@ -939,14 +943,30 @@ def _attn_flash(self, batch_size, q_len, q_states, k_states, v_states, attn_para
        if has_flash_attn_with_softcap:
            flash_kwargs["softcap"] = cfg.attn_logit_softcapping
 
-       attn_output = flash_attn_func(
-           q_states,
-           k_states,
-           v_states,
-           causal = causal,
-           softmax_scale = self.scaling,
-           **flash_kwargs
-       )
+       if self.layer_idx in attn_params.block_diag_layers:
+           q_states = q_states.flatten(start_dim = 0, end_dim = 1)
+           k_states = k_states.flatten(start_dim = 0, end_dim = 1)
+           v_states = v_states.flatten(start_dim = 0, end_dim = 1)
+           max_seqlen = attn_params.get_cu_seqlens_max()
+           cu_seqlens = attn_params.get_cu_seqlens(self.device_idx)
+           attn_output = flash_attn_varlen_func(
+               q_states,
+               k_states,
+               v_states,
+               cu_seqlens,
+               cu_seqlens,
+               max_seqlen,
+               max_seqlen
+           )
+       else:
+           attn_output = flash_attn_func(
+               q_states,
+               k_states,
+               v_states,
+               causal = causal,
+               softmax_scale = self.scaling,
+               **flash_kwargs
+           )
        attn_output = attn_output.reshape((batch_size, q_len, self.num_attention_heads * self.head_dim))
        return attn_output
 
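
The new branch flattens the batch and calls flash_attn_varlen_func so that each span delimited by cu_seqlens attends only to itself, i.e. block-diagonal attention over the packed windows. A minimal reference of what that computes, written with plain PyTorch SDPA instead of flash-attn (tensor shapes and names here are assumptions for illustration):

import torch
import torch.nn.functional as F

def block_diag_attention(q, k, v, cu_seqlens):
    # q, k, v: (total_tokens, num_heads, head_dim); cu_seqlens: (num_windows + 1,)
    out = torch.empty_like(q)
    for a, b in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()):
        # each window [a, b) attends only within itself
        qw = q[a:b].transpose(0, 1)  # (num_heads, window_len, head_dim)
        kw = k[a:b].transpose(0, 1)
        vw = v[a:b].transpose(0, 1)
        out[a:b] = F.scaled_dot_product_attention(qw, kw, vw).transpose(0, 1)
    return out

q = k = v = torch.randn(10, 4, 8)
cu = torch.tensor([0, 3, 7, 10])                 # three windows of lengths 3, 4 and 3
print(block_diag_attention(q, k, v, cu).shape)   # torch.Size([10, 4, 8])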

exllamav2/attn_params.py

Lines changed: 34 additions & 0 deletions
@@ -21,6 +21,10 @@ class Params:
     alt_rope_embed_dict: dict | None
     rope_offsets: torch.Tensor | None
     non_causal_attn: bool
+    block_diag_layers: set
+    block_diag_mask: torch.Tensor | None
+    cu_seqlens: torch.Tensor | None
+    cu_seqlens_max: int | None
 
     def __init__(
         self,
@@ -66,6 +70,11 @@ def __init__(
         self.past_len_tp = None
         self.paged = paged
 
+        self.block_diag_layers = set()
+        self.block_diag_mask = None
+        self.cu_seqlens = None
+        self.cu_seqlens_max = None
+
     def is_causal(self) -> bool:
         return self.input_mask is None
 
@@ -164,6 +173,31 @@ def get_rope_offsets(self, device_idx: int) -> torch.Tensor | None:
            self.rope_offsets = safe_move_tensor(self.rope_offsets, device_idx, non_blocking = True)
        return self.rope_offsets
 
+    def get_cu_seqlens(self, device: int) -> torch.Tensor | None:
+        if self.cu_seqlens is None:
+            return None
+        if self.cu_seqlens.device.index != device:
+            self.cu_seqlens = safe_move_tensor(self.cu_seqlens, device, non_blocking = True)
+        return self.cu_seqlens
+
+    def get_cu_seqlens_max(self) -> torch.Tensor | None:
+        assert self.cu_seqlens is not None
+        if self.cu_seqlens_max is not None:
+            return self.cu_seqlens_max
+        self.cu_seqlens_max = (self.cu_seqlens[1:] - self.cu_seqlens[:-1]).max().item()
+        return self.cu_seqlens_max
+
+    def get_block_diag_mask(self, device: int) -> torch.Tensor | None:
+        if self.block_diag_mask is None:
+            csl = self.get_cu_seqlens(device)
+            if csl is None:
+                return None
+            positions = torch.arange(csl[-1], device = csl.device)
+            labels = torch.searchsorted(csl[1:], positions, right = True)
+            self.block_diag_mask = labels.unsqueeze(0) == labels.unsqueeze(1).repeat(self.batch_size)
+        if self.block_diag_mask.device.index != device:
+            self.block_diag_mask = safe_move_tensor(self.block_diag_mask, device, non_blocking = True)
+        return self.block_diag_mask
 
 
 class PagedParams(Params):
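
get_block_diag_mask builds the equivalent mask for the non-flash paths: torch.searchsorted labels every token position with the index of the window it falls into, and comparing the labels pairwise gives a boolean block-diagonal mask. A small standalone example of that construction (independent of the Params class above):

import torch

cu_seqlens = torch.tensor([0, 3, 7])                 # two windows: lengths 3 and 4
positions = torch.arange(cu_seqlens[-1])             # positions 0 .. 6
labels = torch.searchsorted(cu_seqlens[1:], positions, right = True)
mask = labels.unsqueeze(0) == labels.unsqueeze(1)    # True only within a window
print(labels)       # tensor([0, 0, 0, 1, 1, 1, 1])
print(mask.int())   # a 3x3 and a 4x4 block of ones on the diagonal, zeros elsewhere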

exllamav2/config.py

Lines changed: 25 additions & 7 deletions
@@ -135,6 +135,7 @@ class ExLlamaV2Config:
     vision_num_key_value_groups: int | None
     vision_hidden_size: int | None
     vision_intermediate_size: int | None
+    vision_merger_intermediate_size: int | None
     vision_hidden_act: str | None
     vision_rope_theta: float | None
     vision_feature_layer: int | None
@@ -152,6 +153,8 @@ class ExLlamaV2Config:
     vision_max_pixels: int | None
     vision_temporal_patch_size: int | None
     vision_max_size: int | None
+    vision_fullatt_block_indexes: list | None
+    vision_window_size: int | None
 
     # Deprecated fields, kept for compatibiltiy
 
@@ -478,6 +481,8 @@ def check_keys(archparams, prefix):
 
        # TODO: Cleanup & refactor
 
+       self.vision_fullatt_block_indexes = None
+
        if self.vision_model_type is None:
            pass
 
@@ -495,6 +500,7 @@ def check_keys(archparams, prefix):
            self.vision_feature_layer = read(read_config, int, ["vision_feature_layer"], no_default)
            self.vision_num_layers = read(read_config, int, ["vision_config->num_hidden_layers"], 24)
            self.vision_intermediate_size = read(read_config, int, ["vision_config->intermediate_size"], self.hidden_size)
+           self.vision_merger_intermediate_size = self.vision_intermediate_size
 
            image_processor_type = read(read_prep_config, str, ["image_processor_type"], no_default)
            assert image_processor_type == "PixtralImageProcessor", \
@@ -511,10 +517,27 @@ def check_keys(archparams, prefix):
            self.vision_spatial_merge_size = 1
            self.vision_max_size = 16384
 
-       elif self.vision_model_type == "qwen2":
+       elif self.vision_model_type in ["qwen2", "qwen2.5"]:
+           image_processor_type = read(read_prep_config, str, ["image_processor_type"], no_default)
+           if self.vision_model_type == "qwen2":
+               self.vision_hidden_size = read(read_config, int, ["vision_config->embed_dim"], no_default)
+               mlp_ratio = read(read_config, int, ["vision_config->mlp_ratio"], None)
+               self.vision_intermediate_size = self.vision_hidden_size * mlp_ratio
+               self.vision_merger_intermediate_size = self.vision_intermediate_size
+               assert image_processor_type == "Qwen2VLImageProcessor", \
+                   f"Wrong image processor type: {image_processor_type}"
+               self.vision_window_size = None
+           elif self.vision_model_type == "qwen2.5":
+               self.vision_hidden_size = read(read_config, int, ["vision_config->hidden_size"], no_default)
+               self.vision_intermediate_size = read(read_config, int, ["vision_config->intermediate_size"], no_default)
+               self.vision_fullatt_block_indexes = read(read_config, list, ["vision_config->fullatt_block_indexes", None])
+               self.vision_window_size = read(read_config, int, ["vision_config->window_size", None])
+               assert image_processor_type == "Qwen2_5_VLImageProcessor", \
+                   f"Wrong image processor type: {image_processor_type}"
+               self.vision_merger_intermediate_size = 5120  # TODO: This doesn't seem to appear in the config anywhere?
+
            self.vision_num_attention_heads = read(read_config, int, ["vision_config->num_heads"], no_default)
            self.vision_num_key_value_heads = self.vision_num_attention_heads
-           self.vision_hidden_size = read(read_config, int, ["vision_config->embed_dim"], no_default)
            self.vision_head_dim = self.vision_hidden_size // self.vision_num_attention_heads
            self.vision_num_key_value_groups = 1
            self.vision_hidden_act = "quickgelu"
@@ -523,12 +546,7 @@ def check_keys(archparams, prefix):
            patch_size = read(read_config, int, ["vision_config->patch_size"], no_default)
            self.vision_rope_theta = read(read_config, int, ["vision_config->rope_theta"], 10000.0)
            self.vision_num_layers = read(read_config, int, ["vision_config->depth"], no_default)
-           mlp_ratio = read(read_config, int, ["vision_config->mlp_ratio"], no_default)
-           self.vision_intermediate_size = self.vision_hidden_size * mlp_ratio
 
-           image_processor_type = read(read_prep_config, str, ["image_processor_type"], no_default)
-           assert image_processor_type == "Qwen2VLImageProcessor", \
-               f"Wrong image processor type: {image_processor_type}"
            self.vision_image_mean = read(read_prep_config, list, ["image_mean"], no_default)
            self.vision_image_std = read(read_prep_config, list, ["image_std"], no_default)
            assert read(read_prep_config, int, ["patch_size"], no_default) == patch_size, \
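
The "->" in keys such as "vision_config->window_size" addresses nested entries of the model's config.json. A rough standalone illustration of that addressing scheme (this is not the actual exllamav2 read() helper, and the sample values below are assumptions used only for illustration):

def read_nested(cfg: dict, key: str, default = None):
    # "vision_config->window_size" walks cfg["vision_config"]["window_size"]
    node = cfg
    for part in key.split("->"):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

example = {"vision_config": {"window_size": 112, "fullatt_block_indexes": [7, 15, 23, 31]}}
print(read_nested(example, "vision_config->window_size"))            # 112
print(read_nested(example, "vision_config->fullatt_block_indexes"))  # [7, 15, 23, 31]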

exllamav2/mlp.py

Lines changed: 4 additions & 3 deletions
@@ -51,6 +51,7 @@ def __init__(
         out_features: int | None = None,
         interm_features: int | None = None,
         merge: int | None = None,
+        pad32: bool = True,
     ):
         super().__init__(model, key, archparams)
         cfg = self.model.config
@@ -98,8 +99,8 @@ def __init__(
         self.pre_layernorm = None
         self.post_layernorm = None
 
-        self.up_proj = ExLlamaV2Linear(model, key + km["mlp_up"], in_features, interm_features, ap.mlp_bias, f_key = f_key, f_beg = f_b, f_end = f_c)
-        self.down_proj = ExLlamaV2Linear(model, key + km["mlp_down"], interm_features, out_features, ap.mlp_bias, prescale = cfg.scale_depth)
+        self.up_proj = ExLlamaV2Linear(model, key + km["mlp_up"], in_features, interm_features, ap.mlp_bias, f_key = f_key, f_beg = f_b, f_end = f_c, pad32 = pad32)
+        self.down_proj = ExLlamaV2Linear(model, key + km["mlp_down"], interm_features, out_features, ap.mlp_bias, prescale = cfg.scale_depth, pad32 = pad32)
 
         self.submodules = [self.up_proj,
                            self.down_proj]
@@ -109,7 +110,7 @@ def __init__(
             self.submodules += [self.post_layernorm]
 
         if ap.mlp_gate:
-            self.gate_proj = ExLlamaV2Linear(model, key + km["mlp_gate"], in_features, interm_features, ap.mlp_bias, f_key = f_key, f_beg = f_a, f_end = f_b)
+            self.gate_proj = ExLlamaV2Linear(model, key + km["mlp_gate"], in_features, interm_features, ap.mlp_bias, f_key = f_key, f_beg = f_a, f_end = f_b, pad32 = pad32)
             self.submodules += [self.gate_proj]
         else:
             self.gate_proj = None

exllamav2/vlm/processor/pixtral.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ def preprocess(
 
     image = image.transpose(2, 0, 1)
     image = torch.from_numpy(image).half()
-    return image, new_size
+    return image, new_size, None, None
 
 def postprocess(
     model: ExLlamaV2,

exllamav2/vlm/processor/qwen2.py

Lines changed: 50 additions & 2 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import torch
+import torch.nn.functional as F
 import numpy as np
 from PIL import Image
 from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
@@ -86,7 +87,7 @@ def preprocess(
 
     if mode == "image":
         image = torch.from_numpy(flatten_patches).half()
-        return image, new_size
+        return image, new_size, (grid_t, grid_h, grid_w), config.vision_spatial_patch_size ** 2
     else:
         video = torch.from_numpy(flatten_patches).half()
         return video, new_size, (grid_t, grid_h, grid_w), config.vision_spatial_patch_size ** 2
@@ -149,4 +150,51 @@ def position_embeddings(
     cos = cos.unsqueeze(1).repeat(1, 1, 2).contiguous()
     sin = sin.unsqueeze(1).repeat(1, 1, 2).contiguous()
 
-    return sin, cos
+    return sin, cos
+
+
+def get_window_index(grid_thw, config: ExLlamaV2Config):
+
+    window_index: list = []
+    cu_window_seqlens: list = [0]
+    window_index_id = 0
+    vit_merger_window_size = (
+        config.vision_window_size //
+        config.vision_spatial_merge_size //
+        config.vision_patch_size["height"]
+    )
+
+    for grid_t, grid_h, grid_w in grid_thw:
+        llm_grid_h, llm_grid_w = (
+            grid_h // config.vision_spatial_merge_size,
+            grid_w // config.vision_spatial_merge_size,
+        )
+        index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
+        pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+        pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+        num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+        num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+        index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
+        index_padded = index_padded.reshape(
+            grid_t,
+            num_windows_h,
+            vit_merger_window_size,
+            num_windows_w,
+            vit_merger_window_size,
+        )
+        index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+            grid_t,
+            num_windows_h * num_windows_w,
+            vit_merger_window_size,
+            vit_merger_window_size,
+        )
+        seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+        index_padded = index_padded.reshape(-1)
+        index_new = index_padded[index_padded != -100]
+        window_index.append(index_new + window_index_id)
+        cu_seqlens_tmp = seqlens.cumsum(0) * config.vision_spatial_merge_size**2 + cu_window_seqlens[-1]
+        cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+        window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+
+    window_index = torch.cat(window_index, dim = 0)
+    return window_index, cu_window_seqlens
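
A rough usage sketch of get_window_index as defined above, with a stand-in namespace instead of a real ExLlamaV2Config (the attribute values are assumptions chosen only to make the arithmetic concrete; in practice they are read from the model's config files):

from types import SimpleNamespace
import torch

cfg = SimpleNamespace(
    vision_window_size = 112,            # hypothetical values, illustration only
    vision_spatial_merge_size = 2,
    vision_patch_size = {"height": 14},
)
# vit_merger_window_size = 112 // 2 // 14 = 4 merged positions per window side

grid_thw = torch.tensor([[1, 16, 16]])   # one image of 16 x 16 patches
window_index, cu_window_seqlens = get_window_index(grid_thw, cfg)
print(window_index.shape)      # torch.Size([64]): the 8 x 8 merged positions, reordered window by window
print(len(cu_window_seqlens))  # 1 + 3 * 3 entries: cumulative patch counts per (possibly empty) window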
