@@ -178,42 +178,13 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu
         assert self.processor is not None, "Processor is not initialized"
         processed = self.processor(image_files)
 
-        # Handle nested structure (with image splitting)
+        # Dispatch to the appropriate handler based on structure.
+        # The ColModernVBERT processor divides the original image into
+        # subimages and processes them separately.
         if isinstance(processed[0], list):
-            # processed = [[img1_patches], [img2_patches], ...]
-            # Need shape: (batch_size, max_patches, C, H, W)
-
-            patch_counts = [len(patches) for patches in processed]
-            max_patches = max(patch_counts)
-
-            # Get dimensions from first patch
-            C, H, W = processed[0][0].shape
-
-            # Create padded array
-            batch_size = len(processed)
-            encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype)
-
-            # Create attention mask (1 for real patches, 0 for padding)
-            attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64)
-
-            # Fill in patches and attention mask
-            for i, patches in enumerate(processed):
-                for j, patch in enumerate(patches):
-                    encoded[i, j] = patch
-                    attention_mask[i, j] = 1
-
-            # Track actual patch counts for later use
-            metadata = {"patch_counts": patch_counts}
+            encoded, attention_mask, metadata = self._process_nested_patches(processed)
         else:
-            # Flat structure (no splitting) - still need batch dimension
-            # Shape: (batch_size, 1, C, H, W)
-            encoded = np.array(processed)
-            if len(encoded.shape) == 4:  # (batch_size, C, H, W)
-                encoded = encoded[:, np.newaxis, ...]  # Add num_patches=1 dimension
-
-            # All patches are real (no padding)
-            attention_mask = np.ones((len(images), encoded.shape[1]), dtype=np.int64)
-            metadata = {"patch_counts": [encoded.shape[1]] * len(images)}
+            encoded, attention_mask, metadata = self._process_flat_images(processed, len(images))
 
         onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask}
         onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs)
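
A minimal sketch of what the dispatch distinguishes, assuming a processor that returns numpy arrays; the shapes and the `nested`/`flat` names are illustrative, not from the PR:

import numpy as np

# Two images: the first split into 3 subimages, the second into 2.
nested = [
    [np.zeros((3, 224, 224), dtype=np.float32) for _ in range(3)],
    [np.zeros((3, 224, 224), dtype=np.float32) for _ in range(2)],
]
# No splitting: one array per image.
flat = [np.zeros((3, 224, 224), dtype=np.float32) for _ in range(2)]

assert isinstance(nested[0], list)    # taken by the nested-patch branch
assert not isinstance(flat[0], list)  # taken by the flat-image branch
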
@@ -225,6 +196,105 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu
             metadata=metadata,
         )
 
+    def _process_nested_patches(
+        self, processed: list[list[NumpyArray]]
+    ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]:
+        """
+        Process nested image patches (from ImageSplitter).
+
+        Args:
+            processed: List of patch lists, one per image: [[img1_patches], [img2_patches], ...]
+
+        Returns:
+            tuple: (encoded array, attention_mask, metadata)
+                - encoded: (batch_size, max_patches, C, H, W)
+                - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding
+                - metadata: Dict with 'patch_counts' key
+        """
+        patch_counts = [len(patches) for patches in processed]
+        max_patches = max(patch_counts)
+
+        # Get dimensions from first patch
+        C, H, W = processed[0][0].shape
+        batch_size = len(processed)
+
+        # Create padded array
+        encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype)
+
+        # Create attention mask (1 for real patches, 0 for padding)
+        attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64)
+
+        # Fill in patches and attention mask
+        for i, patches in enumerate(processed):
+            for j, patch in enumerate(patches):
+                encoded[i, j] = patch
+                attention_mask[i, j] = 1
+
+        metadata = {"patch_counts": patch_counts}
+        return encoded, attention_mask, metadata
+
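
To see the padding behavior in isolation, here is a runnable sketch of the same logic outside the class; the `pad_nested_patches` name and the toy patch shape are hypothetical:

import numpy as np

def pad_nested_patches(processed):
    # Mirror of the method above: right-pad each image's patch list to the batch maximum.
    patch_counts = [len(p) for p in processed]
    max_patches = max(patch_counts)
    C, H, W = processed[0][0].shape
    encoded = np.zeros((len(processed), max_patches, C, H, W), dtype=processed[0][0].dtype)
    mask = np.zeros((len(processed), max_patches), dtype=np.int64)
    for i, patches in enumerate(processed):
        for j, patch in enumerate(patches):
            encoded[i, j] = patch
            mask[i, j] = 1
    return encoded, mask, {"patch_counts": patch_counts}

batch = [
    [np.ones((3, 14, 14), dtype=np.float32)] * 3,  # image with 3 patches
    [np.ones((3, 14, 14), dtype=np.float32)] * 1,  # image with 1 patch
]
encoded, mask, meta = pad_nested_patches(batch)
print(encoded.shape)  # (2, 3, 3, 14, 14)
print(mask.tolist())  # [[1, 1, 1], [1, 0, 0]]
print(meta)           # {'patch_counts': [3, 1]}
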
+    def _process_flat_images(
+        self, processed: list[NumpyArray], num_images: int
+    ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]:
+        """
+        Process flat image arrays (from standard processors like SiglipImageProcessor).
+
+        For models expecting 5D input (Idefics3-based), adds a patch dimension.
+        For models expecting 4D input, keeps the original shape.
+
+        Args:
+            processed: List of image arrays
+            num_images: Number of images being processed
+
+        Returns:
+            tuple: (encoded array, attention_mask, metadata)
+                - encoded: (batch_size, C, H, W) for 4D models or (batch_size, 1, C, H, W) for 5D models
+                - attention_mask: (batch_size, 1) with all ones
+                - metadata: Dict with 'patch_counts' key
+        """
+        encoded = np.array(processed)
+
+        # Check whether the model needs a patch dimension based on its ONNX signature
+        if len(encoded.shape) == 4 and self._needs_patch_dimension():
+            # Add patch dimension for Idefics3-based models: (batch, 1, C, H, W)
+            encoded = encoded[:, np.newaxis, ...]
+
+        # Determine attention mask shape based on the final tensor shape
+        if len(encoded.shape) == 5:
+            # 5D tensor: attention_mask shape is (batch, num_patches)
+            attention_mask = np.ones((num_images, encoded.shape[1]), dtype=np.int64)
+            metadata = {"patch_counts": [encoded.shape[1]] * num_images}
+        else:
+            # 4D tensor: attention_mask shape is (batch, 1)
+            attention_mask = np.ones((num_images, 1), dtype=np.int64)
+            metadata = {"patch_counts": [1] * num_images}
+
+        return encoded, attention_mask, metadata
+
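
A short sketch of the two shape outcomes of the flat path, with an assumed 224x224 input size:

import numpy as np

processed = [np.zeros((3, 224, 224), dtype=np.float32) for _ in range(2)]
encoded = np.array(processed)             # (2, 3, 224, 224): fine for 4D models

# Idefics3-style (5D) models get a singleton patch dimension instead:
encoded_5d = encoded[:, np.newaxis, ...]  # (2, 1, 3, 224, 224)

# Either way the mask ends up (batch, 1) here, since nothing was split.
mask = np.ones((2, encoded_5d.shape[1]), dtype=np.int64)
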
+    def _needs_patch_dimension(self) -> bool:
+        """
+        Determine if this model needs the patch dimension by checking the ONNX input shape.
+
+        Idefics3-based models (like ColModernVBERT) need 5D tensors (batch_size, patch_count, C, H, W).
+        Earlier models (like ColPali v1.3) need 4D tensors (batch_size, C, H, W).
+
+        Returns:
+            bool: True if the pixel_values input has 5 dimensions, False if it has 4
+        """
+        if not hasattr(self, "model") or self.model is None:
+            return False
+
+        # Get pixel_values input metadata
+        for input_meta in self.model.get_inputs():
+            if input_meta.name == "pixel_values":
+                # input_meta.shape is a list like
+                # ['batch_size', 'sequence_length', 'num_channels', 'height', 'width']
+                # or ['batch_size', 'num_channels', 'height', 'width']
+                return len(input_meta.shape) == 5
+
+        # Default to False for backward compatibility
+        return False
+
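
The same rank check can be reproduced against a raw onnxruntime session; `model.onnx` is a placeholder path, not a file from the PR:

import onnxruntime as ort

session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
for node in session.get_inputs():
    if node.name == "pixel_values":
        # Dynamic axes appear as strings, e.g. ['batch_size', 'num_channels', 'height', 'width'];
        # only the rank (len) matters for the 4D-vs-5D decision.
        print(node.shape, "-> needs patch dim:", len(node.shape) == 5)
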
     def _embed_images(
         self,
         model_name: str,