Skip to content

Commit 800f388

Browse files
Model: ModernVBERT/colmodernvbert (#588)
* Add ColModernVBERT to LateInteractionMultimodalEmbedding registry * Implement image processing based on Idefics3ImageProcessor logic * Fix padding support * Implement ColModernVBERT logic * Remove TODOs * Handle empty pixel values with proper image_size * Add ColModernVBERT tests * Run pre-commit * mypy fixes * mypy fixes * mypy fixes * mypy fixes * Fix typo in the class name * Add processor_config.json to additional files * Fix mypy errors * Refactor onnx_embed_image * Fix mypy errors * fix: colmodernvbert tests and query processing * fix: remove Union references * fix: fix exit stack, update tests, implement token count * fix: uncomment colpali in tests * fix: lowercase models to cache * fix: fix models to cache * refactor: move colmodernvbert related onnx embed to its class --------- Co-authored-by: George Panchuk <george.panchuk@qdrant.tech>
1 parent 020d535 commit 800f388

File tree

9 files changed

+950
-61
lines changed

9 files changed

+950
-61
lines changed

fastembed/common/onnx_model.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class OnnxOutputContext:
2121
model_output: NumpyArray
2222
attention_mask: NDArray[np.int64] | None = None
2323
input_ids: NDArray[np.int64] | None = None
24+
metadata: dict[str, Any] | None = None
2425

2526

2627
class OnnxModel(Generic[T]):

fastembed/common/preprocessor_utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,10 @@ def load_tokenizer(model_dir: Path) -> tuple[Tokenizer, dict[str, int]]:
5050

5151
tokenizer = Tokenizer.from_file(str(tokenizer_path))
5252
tokenizer.enable_truncation(max_length=max_context)
53-
tokenizer.enable_padding(
54-
pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"]
55-
)
53+
if not tokenizer.padding:
54+
tokenizer.enable_padding(
55+
pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"]
56+
)
5657

5758
for token in tokens_map.values():
5859
if isinstance(token, str):

fastembed/image/onnx_image_model.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,11 @@ def _build_onnx_input(self, encoded: NumpyArray) -> dict[str, NumpyArray]:
7676
return {input_name: encoded}
7777

7878
def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext:
79-
with contextlib.ExitStack():
79+
with contextlib.ExitStack() as stack:
8080
image_files = [
81-
Image.open(image) if not isinstance(image, Image.Image) else image
81+
stack.enter_context(Image.open(image))
82+
if not isinstance(image, Image.Image)
83+
else image
8284
for image in images
8385
]
8486
assert self.processor is not None, "Processor is not initialized"

fastembed/image/transform/functional.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,77 @@ def pad2square(
145145
new_image = Image.new(mode="RGB", size=(size, size), color=fill_color)
146146
new_image.paste(image.crop((left, top, right, bottom)) if crop_required else image)
147147
return new_image
148+
149+
150+
def resize_longest_edge(
151+
image: Image.Image,
152+
max_size: int,
153+
resample: int | Image.Resampling = Image.Resampling.LANCZOS,
154+
) -> Image.Image:
155+
height, width = image.height, image.width
156+
aspect_ratio = width / height
157+
158+
if width >= height:
159+
# Width is longer
160+
new_width = max_size
161+
new_height = int(new_width / aspect_ratio)
162+
else:
163+
# Height is longer
164+
new_height = max_size
165+
new_width = int(new_height * aspect_ratio)
166+
167+
# Ensure even dimensions
168+
if new_height % 2 != 0:
169+
new_height += 1
170+
if new_width % 2 != 0:
171+
new_width += 1
172+
173+
return image.resize((new_width, new_height), resample)
174+
175+
176+
def crop_ndarray(
    image: NumpyArray,
    x1: int,
    y1: int,
    x2: int,
    y2: int,
    channel_first: bool = True,
) -> NumpyArray:
    """Crop the rectangle spanning rows ``y1:y2`` and columns ``x1:x2``.

    Handles both (C, H, W) and (H, W, C) layouts, selected by ``channel_first``.
    Returns a view into ``image`` (numpy basic slicing), not a copy.
    """
    if not channel_first:
        # (H, W, C): spatial axes come first, channels trail.
        return image[y1:y2, x1:x2, :]
    # (C, H, W): channels lead; crop the trailing spatial axes.
    return image[:, y1:y2, x1:x2]
190+
191+
192+
def resize_ndarray(
    image: NumpyArray,
    size: tuple[int, int],
    resample: int | Image.Resampling = Image.Resampling.LANCZOS,
    channel_first: bool = True,
) -> NumpyArray:
    """Resize an RGB image array by round-tripping through PIL.

    Accepts either (C, H, W) (``channel_first=True``) or (H, W, C) layout and
    returns the resized array in the same layout. Float inputs are assumed to
    be normalized to [0, 1] — they are scaled to uint8 for PIL and scaled back
    to float32 in [0, 1] afterwards; integer inputs stay uint8-valued.
    """
    # PIL wants (H, W, C), so undo the channel-first layout if needed.
    hwc = image.transpose((1, 2, 0)) if channel_first else image

    is_float = hwc.dtype == np.float32 or hwc.dtype == np.float64
    if is_float:
        # Normalized floats: map [0, 1] -> [0, 255] uint8 for PIL.
        pil_image = Image.fromarray((hwc * 255).astype(np.uint8), mode="RGB")
    else:
        # uint8 or similar integer data.
        pil_image = Image.fromarray(hwc.astype(np.uint8), mode="RGB")

    resized_arr: NumpyArray = np.array(pil_image.resize(size, resample))
    if is_float:
        # Back to normalized float32.
        resized_arr = resized_arr.astype(np.float32) / 255.0

    # Restore the caller's layout.
    return resized_arr.transpose((2, 0, 1)) if channel_first else resized_arr

0 commit comments

Comments
 (0)