(draft) colpali WIP #394
The first new file adds the image-side model, `ColpaliImageModel`:

```python
import contextlib
from typing import Any, Dict, Iterable, List

import numpy as np
from PIL import Image

from fastembed.common import ImageInput
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.image.onnx_embedding import OnnxImageEmbedding

supported_onnx_models = [
    {
        "model": "akshayballal/colpali-v1.2-merged",
        # 1030 vectors per image (1024 image-placeholder tokens plus the
        # 6 text-prompt ids below), each 128-dimensional.
        "dim": (1030, 128),
        "description": "Image embeddings, Unimodal (image), Aligned to text latent space via PaliGemma-3B, 512 patches max, 2024.",
        "license": "mit",
        "size_in_GB": 6.08,
        "sources": {
            "hf": "akshayballal/colpali-v1.2-merged-onnx",
        },
        "additional_files": ["model.onnx_data"],
        "model_file": "model.onnx",
    }
]


class ColpaliImageModel(OnnxImageEmbedding):
    # Fixed prompt fed alongside every image: 1024 image-placeholder tokens
    # (id 257152) followed by 6 text-prompt ids, 1030 positions in total.
    empty_text_placeholder = np.array([257152] * 1024 + [2, 50721, 573, 2416, 235265, 108])
    # Uniform attention mask covering all 1030 positions.
    even_attention_mask = np.array([1] * 1030)

    def _preprocess_onnx_input(
        self, onnx_input: Dict[str, np.ndarray], **kwargs
    ) -> Dict[str, np.ndarray]:
        # Replace whatever input_ids the base class produced with one copy of
        # the fixed placeholder prompt per image in the batch.
        onnx_input["input_ids"] = np.array(
            [self.empty_text_placeholder for _ in onnx_input["input_ids"]]
        )
        onnx_input["attention_mask"] = np.array(
            [self.even_attention_mask for _ in onnx_input["input_ids"]]
        )
        return onnx_input

    @classmethod
    def list_supported_models(cls) -> List[Dict[str, Any]]:
        """
        Lists the supported models.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing the model information.
        """
        return supported_onnx_models

    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[np.ndarray]:
        return output.model_output.astype(np.float32)

    def onnx_embed(self, images: List[ImageInput], **kwargs) -> OnnxOutputContext:
        with contextlib.ExitStack() as stack:
            # Register opened files with the stack so they are closed on exit;
            # images that are already PIL objects pass through untouched.
            image_files = [
                image if isinstance(image, Image.Image) else stack.enter_context(Image.open(image))
                for image in images
            ]
            encoded = self.processor(image_files)
            onnx_input = self._build_onnx_input(encoded)
            onnx_input = self._preprocess_onnx_input(onnx_input)

            model_output = self.model.run(None, onnx_input)
            # One (1030, 128) multi-vector embedding per input image.
            embeddings = model_output[0].reshape(len(images), *supported_onnx_models[0]["dim"])
            return OnnxOutputContext(model_output=embeddings)
```
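For context, a rough usage sketch. It assumes `ColpaliImageModel` is constructed and called like the other `OnnxImageEmbedding` subclasses in fastembed (a `model_name` constructor argument and an `embed` generator); neither is shown in this diff, so treat the exact signatures as assumptions:

```python
# Hypothetical usage, assuming the class is wired into fastembed's model
# registry the same way as existing OnnxImageEmbedding subclasses.
from pathlib import Path

model = ColpaliImageModel(model_name="akshayballal/colpali-v1.2-merged")
images = [Path("page_1.png"), Path("page_2.png")]

# Each image yields a (1030, 128) matrix of patch-level vectors,
# not a single pooled vector.
for embedding in model.embed(images):
    print(embedding.shape)  # (1030, 128)
```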
The second new file adds the text-side query model, `ColpaliTextModel`:

```python
from typing import Any, Dict, Iterable, List

import numpy as np

from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.text.onnx_embedding import OnnxTextEmbedding

supported_onnx_models = [
    {
        "model": "akshayballal/colpali-v1.2-merged",
        # 16 vectors per query (query tokens plus padding), each 128-dimensional.
        "dim": (16, 128),
        "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.",
        "license": "mit",
        "size_in_GB": 6.08,
        "sources": {
            "hf": "akshayballal/colpali-v1.2-merged-onnx",
        },
        "additional_files": [
            "model.onnx_data",
            "tokenizer.json",
            "tokenizer_config.json",
            "config.json",
        ],
        "model_file": "model.onnx",
    }
]


class ColpaliTextModel(OnnxTextEmbedding):
    query_prefix = "Query: "
    bos_token = "<s>"
    pad_token = "<pad>"
    # Ids prepended to every query in place of the tokenizer's first two ids.
    query_tokens = [2, 9413]
    # The merged vision-language model still expects pixel_values, so queries
    # are paired with an all-zeros image of this shape.
    image_placeholder_size = (3, 448, 448)

    def _preprocess_onnx_input(
        self, onnx_input: Dict[str, np.ndarray], **kwargs
    ) -> Dict[str, np.ndarray]:
        empty_image_placeholder = np.zeros(self.image_placeholder_size, dtype=np.float32)
        onnx_input["pixel_values"] = np.array(
            [empty_image_placeholder for _ in onnx_input["input_ids"]]
        )
        # Placeholder mask; onnx_embed overwrites it with the real one.
        onnx_input["attention_mask"] = np.array([[1] for _ in onnx_input["input_ids"]])
        return onnx_input

    @classmethod
    def list_supported_models(cls) -> List[Dict[str, Any]]:
        """
        Lists the supported models.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing the model information.
        """
        return supported_onnx_models

    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[np.ndarray]:
        return output.model_output.astype(np.float32)

    def _preprocess_queries(self, documents: List[str]) -> List[str]:
        # ColPali query format: "<s>Query: {query}" followed by ten <pad>
        # tokens as a query-augmentation buffer and a trailing newline.
        texts_query: List[str] = []
        for query in documents:
            query = self.bos_token + self.query_prefix + query + self.pad_token * 10
            query += "\n"
            texts_query.append(query)
        return texts_query

    def onnx_embed(
        self,
        documents: List[str],
        **kwargs,
    ) -> OnnxOutputContext:
        documents = self._preprocess_queries(documents)
        # Raise the truncation limit well above any realistic query length.
        self.tokenizer.enable_truncation(max_length=10000)
        encoded = self.tokenize(documents, **kwargs)
        # Splice the fixed query ids onto the front, dropping the two ids the
        # tokenizer produced for the literal "<s>" prefix in the string above.
        input_ids = np.array([self.query_tokens + e.ids[2:] for e in encoded])

        attention_mask = np.array([e.attention_mask for e in encoded])
        onnx_input = {"input_ids": np.array(input_ids, dtype=np.int64)}
        onnx_input = self._preprocess_onnx_input(onnx_input, **kwargs)
        onnx_input["attention_mask"] = attention_mask
        model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)
        return OnnxOutputContext(
            model_output=model_output[0],
            attention_mask=onnx_input.get("attention_mask", attention_mask),
            input_ids=onnx_input.get("input_ids", input_ids),
        )
```
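Since the text model is described as ColBERT-compatible, the two models are presumably meant to be combined with late-interaction (MaxSim) scoring: every query token vector is matched against its best image patch, and the maxima are summed. The scoring step is not part of this diff, so the following is a minimal numpy sketch, assuming one (16, 128) query matrix and (1030, 128) page matrices as produced above:

```python
import numpy as np

def maxsim_score(query_emb: np.ndarray, page_emb: np.ndarray) -> float:
    """ColBERT-style late-interaction score between one query and one page.

    query_emb: (num_query_tokens, 128), e.g. (16, 128) from ColpaliTextModel.
    page_emb:  (num_patches, 128),      e.g. (1030, 128) from ColpaliImageModel.
    """
    # Pairwise dot products between every query token and every image patch.
    sim = query_emb @ page_emb.T  # (16, 1030)
    # For each query token keep its best-matching patch, then sum over tokens.
    return float(sim.max(axis=1).sum())

# Ranking pages for a query then reduces to:
# scores = [maxsim_score(query_embedding, p) for p in page_embeddings]
```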