Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
8a32898
chore: Trigger CI test
hh-space-invader Feb 24, 2025
f85c528
chore: Trigger CI test
hh-space-invader Feb 24, 2025
319cd9e
chore: Trigger CI test
hh-space-invader Feb 24, 2025
51c45e4
chore: Trigger CI test
hh-space-invader Feb 24, 2025
03533fc
chore: Trigger CI test
hh-space-invader Feb 24, 2025
32cf1b2
chore: Trigger CI test
hh-space-invader Feb 24, 2025
60f924c
chore: Trigger CI test
hh-space-invader Feb 24, 2025
7455c01
chore: Trigger CI test
hh-space-invader Feb 24, 2025
2628bc6
chore: Trigger CI test
hh-space-invader Feb 24, 2025
9a0f828
Trigger CI
hh-space-invader Feb 25, 2025
bcc1e81
Trigger CI
hh-space-invader Feb 25, 2025
ee6df94
Trigger CI
hh-space-invader Feb 25, 2025
5364c1b
Trigger CI test
hh-space-invader Feb 25, 2025
804cb39
Trigger CI test
hh-space-invader Feb 25, 2025
65ea860
Trigger CI test
hh-space-invader Feb 25, 2025
87dd019
Trigger CI test
hh-space-invader Feb 25, 2025
191b3f3
Trigger CI test
hh-space-invader Feb 25, 2025
742669a
Trigger CI test
hh-space-invader Feb 25, 2025
f523068
Trigger CI test
hh-space-invader Feb 25, 2025
60870c6
Trigger CI test
hh-space-invader Feb 25, 2025
d628d1e
Trigger CI test
hh-space-invader Feb 25, 2025
d52ddd5
Trigger CI test
hh-space-invader Feb 25, 2025
509b526
Trigger CI test
hh-space-invader Feb 25, 2025
73eb1d1
Trigger CI test
hh-space-invader Feb 25, 2025
04f6f75
Trigger CI test
hh-space-invader Feb 25, 2025
6d1ed5c
Trigger CI test
hh-space-invader Feb 25, 2025
9419845
Trigger CI test
hh-space-invader Feb 25, 2025
017bb6b
Trigger CI test
hh-space-invader Feb 25, 2025
5c83bf6
Trigger CI test
hh-space-invader Feb 25, 2025
4577887
Trigger CI test
hh-space-invader Feb 25, 2025
5c27586
Trigger CI test
hh-space-invader Feb 25, 2025
0c6c0ea
Trigger CI test
hh-space-invader Feb 25, 2025
299c69e
Trigger CI test
hh-space-invader Feb 25, 2025
5b9b6e2
Trigger CI test
hh-space-invader Feb 25, 2025
cc89ef6
Trigger CI test
hh-space-invader Feb 25, 2025
e5df3ed
Trigger CI test
hh-space-invader Feb 25, 2025
52a6de3
Trigger CI test
hh-space-invader Feb 25, 2025
ee731b7
Trigger CI test
hh-space-invader Feb 25, 2025
3ce8ff8
new: Added on workflow dispatch
hh-space-invader Feb 27, 2025
3fbabd0
tests: Updated tests
hh-space-invader Feb 27, 2025
dde8490
fix: Fix CI
hh-space-invader Feb 27, 2025
b12fcd9
fix: Fix CI
hh-space-invader Feb 27, 2025
fd63343
fix: Fix CI
hh-space-invader Feb 27, 2025
bd00d5a
improve: Prevent stop iteration error caused by next
hh-space-invader Feb 27, 2025
d961848
fix: Fix variable might be referenced before assignment
hh-space-invader Feb 28, 2025
51e34b4
refactor: Revised the way of getting models to test
hh-space-invader Feb 28, 2025
acbe80f
fix: Fix test in image model
hh-space-invader Feb 28, 2025
9542875
refactor: Call one model
hh-space-invader Feb 28, 2025
7101fa1
fix: Fix ci
hh-space-invader Feb 28, 2025
c3a03d3
fix: Fix splade model name
hh-space-invader Feb 28, 2025
6c69d40
tests: Updated tests
hh-space-invader Mar 4, 2025
643ccc9
chore: Remove cache
hh-space-invader Mar 4, 2025
8bb521a
tests: Update multi task tests
hh-space-invader Mar 4, 2025
4092ba6
tests: Update multi task tests
hh-space-invader Mar 4, 2025
3826294
tests: Updated tests
hh-space-invader Mar 5, 2025
cef3caa
Merge branch 'main' into speedup-ci
hh-space-invader Mar 5, 2025
bf9bb25
refactor: refactor utils func, add comments, conditions refactor
joein Mar 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
name: Tests

on:
push:
branches: [ master, main, gpu ]
pull_request:
branches: [ master, main, gpu ]
workflow_dispatch:


env:
CARGO_TERM_COLOR: always
Expand Down Expand Up @@ -42,4 +43,4 @@ jobs:

- name: Run pytest
run: |
poetry run pytest
poetry run pytest
12 changes: 9 additions & 3 deletions tests/test_image_onnx_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from fastembed import ImageEmbedding
from tests.config import TEST_MISC_DIR
from tests.utils import delete_model_cache
from tests.utils import delete_model_cache, should_test_model

CANONICAL_VECTOR_VALUES = {
"Qdrant/clip-ViT-B-32-vision": np.array([-0.0098, 0.0128, -0.0274, 0.002, -0.0059]),
Expand All @@ -27,11 +27,13 @@
}


def test_embedding() -> None:
@pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"])
def test_embedding(model_name: str) -> None:
is_ci = os.getenv("CI")
is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"

for model_desc in ImageEmbedding._list_supported_models():
if not is_ci and model_desc.size_in_GB > 1:
if not should_test_model(model_desc, model_name, is_ci, is_manual):
continue

dim = model_desc.dim
Expand Down Expand Up @@ -74,8 +76,12 @@ def test_batch_embedding(n_dims: int, model_name: str) -> None:

embeddings = list(model.embed(images, batch_size=10))
embeddings = np.stack(embeddings, axis=0)
assert np.allclose(embeddings[1], embeddings[2])

canonical_vector = CANONICAL_VECTOR_VALUES[model_name]

assert embeddings.shape == (len(test_images) * n_images, n_dims)
assert np.allclose(embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3)
if is_ci:
delete_model_cache(model.model._model_dir)

Expand Down
54 changes: 32 additions & 22 deletions tests/test_late_interaction_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from fastembed.late_interaction.late_interaction_text_embedding import (
LateInteractionTextEmbedding,
)
from tests.utils import delete_model_cache
from tests.utils import delete_model_cache, should_test_model

# vectors are abridged and rounded for brevity
CANONICAL_COLUMN_VALUES = {
Expand Down Expand Up @@ -153,57 +153,70 @@
docs = ["Hello World"]


def test_batch_embedding():
@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
def test_batch_embedding(model_name: str):
is_ci = os.getenv("CI")
docs_to_embed = docs * 10

for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
print("evaluating", model_name)
model = LateInteractionTextEmbedding(model_name=model_name)
result = list(model.embed(docs_to_embed, batch_size=6))
model = LateInteractionTextEmbedding(model_name=model_name)
result = list(model.embed(docs_to_embed, batch_size=6))
expected_result = CANONICAL_COLUMN_VALUES[model_name]

for value in result:
token_num, abridged_dim = expected_result.shape
assert np.allclose(value[:, :abridged_dim], expected_result, atol=2e-3)
for value in result:
token_num, abridged_dim = expected_result.shape
assert np.allclose(value[:, :abridged_dim], expected_result, atol=2e-3)

if is_ci:
delete_model_cache(model.model._model_dir)
if is_ci:
delete_model_cache(model.model._model_dir)


def test_single_embedding():
@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
def test_single_embedding(model_name: str):
is_ci = os.getenv("CI")
is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
docs_to_embed = docs

for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
for model_desc in LateInteractionTextEmbedding._list_supported_models():
if not should_test_model(model_desc, model_name, is_ci, is_manual):
continue

print("evaluating", model_name)
model = LateInteractionTextEmbedding(model_name=model_name)
result = next(iter(model.embed(docs_to_embed, batch_size=6)))
expected_result = CANONICAL_COLUMN_VALUES[model_name]
token_num, abridged_dim = expected_result.shape
assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3)

if is_ci:
delete_model_cache(model.model._model_dir)


def test_single_embedding_query():
@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
def test_single_embedding_query(model_name: str):
is_ci = os.getenv("CI")
is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
queries_to_embed = docs

for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
for model_desc in LateInteractionTextEmbedding._list_supported_models():
if not should_test_model(model_desc, model_name, is_ci, is_manual):
continue

print("evaluating", model_name)
model = LateInteractionTextEmbedding(model_name=model_name)
result = next(iter(model.query_embed(queries_to_embed)))
expected_result = CANONICAL_QUERY_VALUES[model_name]
token_num, abridged_dim = expected_result.shape
assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3)

if is_ci:
delete_model_cache(model.model._model_dir)


def test_parallel_processing():
@pytest.mark.parametrize("token_dim,model_name", [(96, "answerdotai/answerai-colbert-small-v1")])
def test_parallel_processing(token_dim: int, model_name: str):
is_ci = os.getenv("CI")
model = LateInteractionTextEmbedding(model_name="colbert-ir/colbertv2.0")
token_dim = 128
model = LateInteractionTextEmbedding(model_name=model_name)

docs = ["hello world", "flag embedding"] * 100
embeddings = list(model.embed(docs, batch_size=10, parallel=2))
embeddings = np.stack(embeddings, axis=0)
Expand All @@ -222,10 +235,7 @@ def test_parallel_processing():
delete_model_cache(model.model._model_dir)


@pytest.mark.parametrize(
"model_name",
["colbert-ir/colbertv2.0"],
)
@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
def test_lazy_load(model_name: str):
is_ci = os.getenv("CI")

Expand Down
55 changes: 28 additions & 27 deletions tests/test_late_interaction_multimodal.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

import pytest
from PIL import Image
import numpy as np

Expand Down Expand Up @@ -45,38 +46,38 @@


def test_batch_embedding():
is_ci = os.getenv("CI")
if os.getenv("CI"):
pytest.skip("Colpali is too large to test in CI")

if not is_ci:
for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
print("evaluating", model_name)
model = LateInteractionMultimodalEmbedding(model_name=model_name)
result = list(model.embed_image(images, batch_size=2))
for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
print("evaluating", model_name)
model = LateInteractionMultimodalEmbedding(model_name=model_name)
result = list(model.embed_image(images, batch_size=2))

for value in result:
token_num, abridged_dim = expected_result.shape
assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3)
for value in result:
token_num, abridged_dim = expected_result.shape
assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3)


def test_single_embedding():
is_ci = os.getenv("CI")
if not is_ci:
for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
print("evaluating", model_name)
model = LateInteractionMultimodalEmbedding(model_name=model_name)
result = next(iter(model.embed_image(images, batch_size=6)))
token_num, abridged_dim = expected_result.shape
assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
if os.getenv("CI"):
pytest.skip("Colpali is too large to test in CI")

for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
print("evaluating", model_name)
model = LateInteractionMultimodalEmbedding(model_name=model_name)
result = next(iter(model.embed_image(images, batch_size=6)))
token_num, abridged_dim = expected_result.shape
assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)


def test_single_embedding_query():
is_ci = os.getenv("CI")
if not is_ci:
queries_to_embed = queries

for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
print("evaluating", model_name)
model = LateInteractionMultimodalEmbedding(model_name=model_name)
result = next(iter(model.embed_text(queries_to_embed)))
token_num, abridged_dim = expected_result.shape
assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
if os.getenv("CI"):
pytest.skip("Colpali is too large to test in CI")

for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
print("evaluating", model_name)
model = LateInteractionMultimodalEmbedding(model_name=model_name)
result = next(iter(model.embed_text(queries)))
token_num, abridged_dim = expected_result.shape
assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
49 changes: 30 additions & 19 deletions tests/test_sparse_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

from fastembed.sparse.bm25 import Bm25
from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
from tests.utils import delete_model_cache
from tests.utils import delete_model_cache, should_test_model

CANONICAL_COLUMN_VALUES = {
"prithvida/Splade_PP_en_v1": {
"prithivida/Splade_PP_en_v1": {
"indices": [
2040,
2047,
Expand Down Expand Up @@ -49,28 +49,41 @@
docs = ["Hello World"]


def test_batch_embedding() -> None:
@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
def test_batch_embedding(model_name: str) -> None:
is_ci = os.getenv("CI")
docs_to_embed = docs * 10

for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
model = SparseTextEmbedding(model_name=model_name)
result = next(iter(model.embed(docs_to_embed, batch_size=6)))
assert result.indices.tolist() == expected_result["indices"]
model = SparseTextEmbedding(model_name=model_name)
result = next(iter(model.embed(docs_to_embed, batch_size=6)))
expected_result = CANONICAL_COLUMN_VALUES[model_name]
assert result.indices.tolist() == expected_result["indices"]

for i, value in enumerate(result.values):
assert pytest.approx(value, abs=0.001) == expected_result["values"][i]
if is_ci:
delete_model_cache(model.model._model_dir)
for i, value in enumerate(result.values):
assert pytest.approx(value, abs=0.001) == expected_result["values"][i]
if is_ci:
delete_model_cache(model.model._model_dir)


def test_single_embedding() -> None:
@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
def test_single_embedding(model_name: str) -> None:
is_ci = os.getenv("CI")
for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"

for model_desc in SparseTextEmbedding._list_supported_models():
if (
model_desc.model not in CANONICAL_COLUMN_VALUES
): # attention models and bm25 are also parts of
# SparseTextEmbedding, however, they have their own tests
continue
if not should_test_model(model_desc, model_name, is_ci, is_manual):
continue

model = SparseTextEmbedding(model_name=model_name)

passage_result = next(iter(model.embed(docs, batch_size=6)))
query_result = next(iter(model.query_embed(docs)))
expected_result = CANONICAL_COLUMN_VALUES[model_name]
for result in [passage_result, query_result]:
assert result.indices.tolist() == expected_result["indices"]

Expand All @@ -80,9 +93,10 @@ def test_single_embedding() -> None:
delete_model_cache(model.model._model_dir)


def test_parallel_processing() -> None:
@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
def test_parallel_processing(model_name: str) -> None:
is_ci = os.getenv("CI")
model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")
model = SparseTextEmbedding(model_name=model_name)
docs = ["hello world", "flag embedding"] * 30
sparse_embeddings_duo = list(model.embed(docs, batch_size=10, parallel=2))
sparse_embeddings_all = list(model.embed(docs, batch_size=10, parallel=0))
Expand Down Expand Up @@ -172,10 +186,7 @@ def test_disable_stemmer_behavior(disable_stemmer: bool) -> None:
assert result == expected, f"Expected {expected}, but got {result}"


@pytest.mark.parametrize(
"model_name",
["prithivida/Splade_PP_en_v1"],
)
@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
def test_lazy_load(model_name: str) -> None:
is_ci = os.getenv("CI")
model = SparseTextEmbedding(model_name=model_name, lazy_load=True)
Expand Down
Loading