Commit 193a6b2

Pixtral: Add vision tower and preprocessor

1 parent 9504b51

File tree

6 files changed, +295 -6 lines

    exllamav2/__init__.py
    exllamav2/vlm/__init__.py
    exllamav2/vlm/mmprojector.py
    exllamav2/vlm/preprocessor/pixtral.py
    exllamav2/vlm/util.py
    exllamav2/vlm/vision_tower.py
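
The commit adds a Pixtral vision tower and its image preprocessor and exports them from the package root. As orientation, the pieces are intended to be combined roughly as follows; this is a hypothetical sketch inferred from the classes added below (weight loading is not shown in this commit and is omitted here, and the call sequence is an assumption rather than documented usage):

    # Hypothetical sketch (not part of the commit): how the new pieces fit together.
    from PIL import Image
    from exllamav2 import ExLlamaV2Config, ExLlamaV2MultimodalProjector, ExLlamaV2VisionTower

    config = ExLlamaV2Config()
    config.model_dir = "/path/to/pixtral-model"     # assumed checkpoint directory
    config.prepare()

    vision_tower = ExLlamaV2VisionTower(config)
    projector = ExLlamaV2MultimodalProjector(config)
    # (weight loading omitted; the commit does not add or show a load call)

    image = Image.open("example.jpg")                 # any RGB(A) test image
    pixel_values = vision_tower.preprocess(image)     # (3, H', W') float16 tensor
    patch_embeds = vision_tower.process(pixel_values) # per-patch hidden states
    text_embeds = projector.forward(patch_embeds)     # projected toward the LM embedding space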

exllamav2/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -1,7 +1,6 @@
 from exllamav2.version import __version__

 from exllamav2.model import ExLlamaV2
-from exllamav2.vlm import ExLlamaV2MultimodalProjector
 from exllamav2.cache import ExLlamaV2CacheBase
 from exllamav2.cache import ExLlamaV2Cache
 from exllamav2.cache import ExLlamaV2Cache_Q4
@@ -15,3 +14,6 @@
 from exllamav2.util import SeqTensor
 from exllamav2.util import Timer
 from exllamav2.module import Intervention
+
+from exllamav2.vlm.mmprojector import ExLlamaV2MultimodalProjector
+from exllamav2.vlm.vision_tower import ExLlamaV2VisionTower

exllamav2/vlm/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -1,3 +1,4 @@
 from exllamav2.version import __version__

-from exllamav2.vlm.mmprojector import ExLlamaV2MultimodalProjector
+from exllamav2.vlm.mmprojector import ExLlamaV2MultimodalProjector
+from exllamav2.vlm.vision_tower import ExLlamaV2VisionTower

exllamav2/vlm/mmprojector.py

Lines changed: 11 additions & 4 deletions

@@ -1,18 +1,15 @@

-import torch.nn as nn
-import torch.nn.functional as F
-
 from exllamav2 import ExLlamaV2
 from exllamav2.config import ExLlamaV2Config
 from exllamav2.module import ExLlamaV2Module
 from exllamav2.mlp import ExLlamaV2MLP
-from typing import Callable

 class ExLlamaV2MultimodalProjector(ExLlamaV2):

     config: ExLlamaV2Config
     modules: list[ExLlamaV2Module]

+    # noinspection PyMissingConstructor
     def __init__(
         self,
         config: ExLlamaV2Config
@@ -35,8 +32,18 @@ def __init__(
             )
         ]

+    # noinspection PyMethodOverriding
     def forward(self, x):

         for m in self.modules:
             x = m.forward(x)
         return x
+
+    def load_tp(self, **kwargs):
+        raise ValueError("load_tp not supported for multimodal projector")
+    def load_tp_gen(self, **kwargs):
+        raise ValueError("load_tp not supported for multimodal projector")
+    def load_autosplit(self, **kwargs):
+        raise ValueError("load_autosplit not supported for multimodal projector")
+    def load_autosplit_gen(self, **kwargs):
+        raise ValueError("load_autosplit not supported for multimodal projector")

exllamav2/vlm/preprocessor/pixtral.py

Lines changed: 44 additions & 0 deletions (new file)

import torch
import numpy as np
from PIL import Image
from exllamav2.config import ExLlamaV2Config
from exllamav2.vlm.util import (
    convert_to_rgb,
    size_to_longest_edge_and_patch_size,
    normalize_image
)

def preprocess(
    config: ExLlamaV2Config,
    image: Image
) -> torch.Tensor:

    assert "longest_edge" in config.vision_size, \
        "preprocessing size must specify longest_edge"

    patch_size = tuple(config.vision_patch_size[d] for d in ["height", "width"])
    longest_edge = config.vision_size["longest_edge"]
    resample = Image.Resampling(config.vision_resample)
    image_mean = tuple(config.vision_image_mean)
    image_std = tuple(config.vision_image_std)
    rescale_factor = config.vision_rescale_factor

    # Convert to RGB and resize as necessary

    image = convert_to_rgb(image)
    old_size = image.size
    new_size = size_to_longest_edge_and_patch_size(image.size, (longest_edge, longest_edge), patch_size)
    if old_size != new_size:
        image = image.resize(new_size, resample = resample)

    # Convert to numpy array and normalize

    image = np.array(image).astype(np.float32)
    image = image * rescale_factor
    image = normalize_image(image, image_mean, image_std)

    # Convert to tensor, shape (3, resized_height, resized_width)

    image = image.transpose(2, 0, 1)
    image = torch.from_numpy(image).half()
    return image
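
A hedged usage sketch of the preprocessor. The config attributes are exactly the ones read above, but the concrete values (longest edge, patch size, CLIP-style mean/std, 1/255 rescale factor) are placeholders of the kind a Pixtral preprocessor config typically supplies, not values taken from this commit:

    # Hypothetical usage (not part of the commit): a duck-typed stand-in for
    # ExLlamaV2Config carrying the attributes preprocess() reads.
    from types import SimpleNamespace
    from PIL import Image
    from exllamav2.vlm.preprocessor.pixtral import preprocess

    config = SimpleNamespace(
        vision_size = {"longest_edge": 1024},
        vision_patch_size = {"height": 16, "width": 16},
        vision_resample = 3,                                  # Image.Resampling.BICUBIC
        vision_image_mean = [0.48145466, 0.4578275, 0.40821073],
        vision_image_std = [0.26862954, 0.26130258, 0.27577711],
        vision_rescale_factor = 1 / 255,
    )

    image = Image.open("example.jpg")                         # any local test image
    pixel_values = preprocess(config, image)
    print(pixel_values.shape, pixel_values.dtype)             # e.g. torch.Size([3, 768, 1008]) torch.float16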

exllamav2/vlm/util.py

Lines changed: 83 additions & 0 deletions (new file)

import torch
import numpy as np
from PIL import Image
from typing import Tuple

def convert_to_rgb(image: Image) -> Image:
    """
    Converts an image to RGB format and ensures any transparent regions are converted to white
    """
    if image.mode == "RGB":
        return image

    image = image.convert("RGBA")

    new_image = Image.new("RGBA", image.size, "WHITE")
    new_image.paste(image, (0, 0), image)
    new_image = new_image.convert("RGB")
    return new_image


def size_to_longest_edge_and_patch_size(
    input_size: tuple,
    max_size: tuple,
    patch_size: tuple,
) -> tuple:
    """
    Compute the output size for resizing an image while maintaining aspect ratio and constraining to a
    maximum bounding box, while keeping each dimension a multiple of the corresponding patch dimension.
    """

    assert all(p % d == 0 for p, d in zip(max_size, patch_size)), \
        "max_size must be a multiple of patch_size"

    # Reduce to bounding box

    ratio = max(input_size[0] / max_size[0], input_size[1] / max_size[1])
    if ratio > 1:
        output_size = tuple(int(np.ceil(d / ratio)) for d in input_size)
    else:
        output_size = input_size

    # Align size to patch grid

    output_size = tuple((((d + p - 1) // p) * p) for d, p in zip(output_size, patch_size))
    return output_size

def normalize_image(
    image: np.ndarray,
    mean: tuple,
    std: tuple,
) -> np.ndarray:
    """
    Normalizes RGB image in numpy format using the mean and standard deviation specified by `mean` and `std`:
    image = (image - mean) / std
    """

    assert len(mean) == 3 and len(std) == 3, \
        "mean and std arguments must be 3D"

    # Upcast image to float32 if it's not already a float type

    if not np.issubdtype(image.dtype, np.floating):
        image = image.astype(np.float32)

    mean = np.array(mean, dtype = image.dtype)
    std = np.array(std, dtype = image.dtype)
    image = (image - mean) / std
    return image


def position_ids_in_meshgrid(
    height: int,
    width: int,
    max_width: int
):
    """
    Create flat position IDs tensor for grid of patches: id(row, col) = row * max_width + col
    """

    row_indices = torch.arange(height).unsqueeze(1) * max_width
    col_indices = torch.arange(width).unsqueeze(0)
    ids = row_indices + col_indices
    return ids.flatten().unsqueeze(0)
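
To make the two geometry helpers concrete, here is a small worked illustration (hypothetical, not part of the commit; the 1024-pixel longest edge and 16-pixel patches are example values):

    # Illustration only: expected behavior of the helpers above.
    from exllamav2.vlm.util import size_to_longest_edge_and_patch_size, position_ids_in_meshgrid

    # A 2048x1536 image bounded by a 1024x1024 box with 16x16 patches:
    # ratio = max(2048/1024, 1536/1024) = 2.0, so the image shrinks to 1024x768,
    # which is already a multiple of 16 in both dimensions.
    print(size_to_longest_edge_and_patch_size((2048, 1536), (1024, 1024), (16, 16)))  # (1024, 768)

    # A 1000x747 image already fits, so it is only rounded up to the patch grid:
    # ceil(1000/16)*16 = 1008, ceil(747/16)*16 = 752.
    print(size_to_longest_edge_and_patch_size((1000, 747), (1024, 1024), (16, 16)))   # (1008, 752)

    # Position IDs for a 2x3 patch grid with a maximum row stride of 64:
    # row 0 -> 0, 1, 2 and row 1 -> 64, 65, 66, flattened into one sequence.
    print(position_ids_in_meshgrid(2, 3, 64))  # tensor([[ 0,  1,  2, 64, 65, 66]])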

exllamav2/vlm/vision_tower.py

Lines changed: 152 additions & 0 deletions (new file)

from __future__ import annotations
import os, sys

import threading

import torch
from exllamav2 import ExLlamaV2
from exllamav2.conv2d import ExLlamaV2Conv2D
from exllamav2.rmsnorm import ExLlamaV2RMSNorm
from exllamav2.attn import ExLlamaV2Attention
from exllamav2.mlp import ExLlamaV2MLP
from exllamav2.config import ExLlamaV2Config
from exllamav2.module import ExLlamaV2Module
from exllamav2.vlm.preprocessor import pixtral
from exllamav2.compat import safe_move_tensor

from PIL.Image import Image
from exllamav2.vlm.util import position_ids_in_meshgrid

class ExLlamaV2VisionTower(ExLlamaV2):

    config: ExLlamaV2Config
    modules: list[ExLlamaV2Module]

    # noinspection PyMissingConstructor
    def __init__(
        self,
        config: ExLlamaV2Config
    ):
        self.config = config
        cfg = self.config
        self.archparams = cfg.arch.vt
        self.modules = []

        # Preprocessor

        if cfg.vision_model_type == "pixtral":
            self.preprocessor = pixtral.preprocess
        else:
            raise ValueError(f"Unknown vision model type: {cfg.vision_model_type}")

        # Position embeddings

        self.p_maxedge = cfg.vision_size["longest_edge"] // cfg.vision_patch_size["width"]
        freqs = 1.0 / (cfg.vision_rope_theta ** (torch.arange(0, cfg.vision_head_dim, 2).float() / cfg.vision_head_dim))
        h = torch.arange(self.p_maxedge, device=freqs.device)
        w = torch.arange(self.p_maxedge, device=freqs.device)
        freqs_h = torch.outer(h, freqs[::2]).float()
        freqs_w = torch.outer(w, freqs[1::2]).float()
        inv_freq = torch.cat(
            [
                freqs_h[:, None, :].repeat(1, self.p_maxedge, 1),
                freqs_w[None, :, :].repeat(self.p_maxedge, 1, 1),
            ],
            dim=-1,
        ).reshape(-1, cfg.vision_head_dim // 2)
        inv_freq = torch.cat((inv_freq, inv_freq), dim = -1)

        self.rope_cos = inv_freq.cos().half()
        self.rope_sin = inv_freq.sin().half()

        # Patch embeddings

        patch_size = tuple(config.vision_patch_size[x] for x in ["height", "width"])
        patch_conv = ExLlamaV2Conv2D(
            model = self,
            key = cfg.arch.vt_prefix + "patch_conv",
            in_channels = self.config.vision_num_channels,
            out_channels = self.config.vision_hidden_size,
            kernel_size = patch_size,
            has_bias = self.archparams.patch_conv_bias,
            archparams = self.archparams,
        )
        self.modules += [patch_conv]

        # Input norm

        norm = ExLlamaV2RMSNorm(
            model = self,
            key = cfg.arch.vt_prefix + "ln_pre",
            archparams = self.archparams,
        )
        self.modules += [norm]

        # Decoder layers

        for layer_idx in range(self.config.vision_num_layers):
            layer_key = cfg.arch.vt_prefix + f"transformer.layers.{layer_idx}"
            attn = ExLlamaV2Attention(self, layer_key, layer_idx, archparams = self.archparams)
            mlp = ExLlamaV2MLP(self, layer_key, layer_idx, archparams = self.archparams)
            self.modules += [attn, mlp]


    def forward(self, **kwargs):
        raise NotImplementedError()


    def preprocess(self, image: Image) -> torch.Tensor:
        """
        Preprocess image and prepare for vision tower
        """
        return self.preprocessor(self.config, image)


    def process(
        self,
        hidden_states: torch.Tensor,
        abort_event: threading.Event | None = None,
        **kwargs
    ):
        cfg = self.config

        if len(hidden_states.shape) == 3:
            hidden_states = hidden_states.unsqueeze(0)

        bsz, channels, height, width = hidden_states.shape

        p_height = height // cfg.vision_patch_size["height"]
        p_width = width // cfg.vision_patch_size["width"]
        position_ids = position_ids_in_meshgrid(p_height, p_width, self.p_maxedge)

        cos = self.rope_cos[position_ids]
        sin = self.rope_sin[position_ids]
        attn_params = ExLlamaV2Attention.Params(non_causal_attn = True)

        device = self.modules[0].device_idx
        for idx, module in enumerate(self.modules):

            # Respect abort signal

            if abort_event and abort_event.is_set():
                return None, None

            # Onward

            n_device = module.device_idx
            if n_device is not None and n_device != device and n_device >= 0:
                hidden_states = safe_move_tensor(hidden_states, n_device, non_blocking = True)

            if cos.device != hidden_states.device:
                cos = safe_move_tensor(cos, hidden_states.device)
                sin = safe_move_tensor(sin, hidden_states.device)

            hidden_states = module.forward(
                hidden_states,
                attn_params = attn_params,
                **kwargs | {
                    "alt_rope_embedding": (cos, sin)
                }
            )

        return hidden_states
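
The constructor above precomputes one rotary row per cell of the maximum p_maxedge x p_maxedge patch grid, interleaving row and column frequencies, and process() then selects the rows that correspond to the actual patch grid via position_ids_in_meshgrid. A small sketch of that lookup with made-up sizes (hypothetical, not part of the commit):

    # Illustration only: shape of the rotary lookup performed in process().
    import torch
    from exllamav2.vlm.util import position_ids_in_meshgrid

    p_maxedge = 64      # e.g. longest_edge 1024 // patch width 16 (assumed values)
    head_dim = 64       # stand-in for cfg.vision_head_dim

    # Stand-in for self.rope_cos: one row per (row, col) cell of the maximum grid.
    rope_cos = torch.randn(p_maxedge * p_maxedge, head_dim).half()

    # A 768x1008 preprocessed image with 16x16 patches gives a 48x63 patch grid.
    position_ids = position_ids_in_meshgrid(48, 63, p_maxedge)     # shape (1, 3024)
    cos = rope_cos[position_ids]                                   # shape (1, 3024, 64)
    print(position_ids.shape, cos.shape)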

0 commit comments