qdrant · hh-space-invader · Mar 6, 2025 · Feb 24, 2025 · Feb 24, 2025 · Feb 24, 2025
diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml
@@ -1,9 +1,10 @@
 name: Tests
 
 on:
-  push:
-    branches: [ master, main, gpu ]
   pull_request:
+    branches: [ master, main, gpu ]
+  workflow_dispatch:
+
 
 env:
   CARGO_TERM_COLOR: always
@@ -42,4 +43,4 @@ jobs:
 
       - name: Run pytest
         run: |
-          poetry run pytest 
+          poetry run pytest 
diff --git a/tests/test_image_onnx_embeddings.py b/tests/test_image_onnx_embeddings.py
@@ -8,7 +8,7 @@
 
 from fastembed import ImageEmbedding
 from tests.config import TEST_MISC_DIR
-from tests.utils import delete_model_cache
+from tests.utils import delete_model_cache, should_test_model
 
 CANONICAL_VECTOR_VALUES = {
     "Qdrant/clip-ViT-B-32-vision": np.array([-0.0098, 0.0128, -0.0274, 0.002, -0.0059]),
@@ -27,11 +27,13 @@
 }
 
 
-def test_embedding() -> None:
+@pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"])
+def test_embedding(model_name: str) -> None:
     is_ci = os.getenv("CI")
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
 
     for model_desc in ImageEmbedding._list_supported_models():
-        if not is_ci and model_desc.size_in_GB > 1:
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
             continue
 
         dim = model_desc.dim
@@ -74,8 +76,12 @@ def test_batch_embedding(n_dims: int, model_name: str) -> None:
 
     embeddings = list(model.embed(images, batch_size=10))
     embeddings = np.stack(embeddings, axis=0)
+    assert np.allclose(embeddings[1], embeddings[2])
+
+    canonical_vector = CANONICAL_VECTOR_VALUES[model_name]
 
     assert embeddings.shape == (len(test_images) * n_images, n_dims)
+    assert np.allclose(embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3)
     if is_ci:
         delete_model_cache(model.model._model_dir)
 

diff --git a/tests/test_late_interaction_embeddings.py b/tests/test_late_interaction_embeddings.py
@@ -6,7 +6,7 @@
 from fastembed.late_interaction.late_interaction_text_embedding import (
     LateInteractionTextEmbedding,
 )
-from tests.utils import delete_model_cache
+from tests.utils import delete_model_cache, should_test_model
 
 # vectors are abridged and rounded for brevity
 CANONICAL_COLUMN_VALUES = {
@@ -153,57 +153,70 @@
 docs = ["Hello World"]
 
 
-def test_batch_embedding():
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
+def test_batch_embedding(model_name: str):
     is_ci = os.getenv("CI")
     docs_to_embed = docs * 10
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
-        print("evaluating", model_name)
-        model = LateInteractionTextEmbedding(model_name=model_name)
-        result = list(model.embed(docs_to_embed, batch_size=6))
+    model = LateInteractionTextEmbedding(model_name=model_name)
+    result = list(model.embed(docs_to_embed, batch_size=6))
+    expected_result = CANONICAL_COLUMN_VALUES[model_name]
 
-        for value in result:
-            token_num, abridged_dim = expected_result.shape
-            assert np.allclose(value[:, :abridged_dim], expected_result, atol=2e-3)
+    for value in result:
+        token_num, abridged_dim = expected_result.shape
+        assert np.allclose(value[:, :abridged_dim], expected_result, atol=2e-3)
 
-        if is_ci:
-            delete_model_cache(model.model._model_dir)
+    if is_ci:
+        delete_model_cache(model.model._model_dir)
 
 
-def test_single_embedding():
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
+def test_single_embedding(model_name: str):
     is_ci = os.getenv("CI")
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
     docs_to_embed = docs
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+    for model_desc in LateInteractionTextEmbedding._list_supported_models():
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
+            continue
+        # Load the iterated model; model_name is only the selector given to should_test_model.
-        print("evaluating", model_name)
-        model = LateInteractionTextEmbedding(model_name=model_name)
+        print("evaluating", model_desc.model)
+        model = LateInteractionTextEmbedding(model_name=model_desc.model)
         result = next(iter(model.embed(docs_to_embed, batch_size=6)))
+        expected_result = CANONICAL_COLUMN_VALUES[model_desc.model]
         token_num, abridged_dim = expected_result.shape
         assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3)
 
         if is_ci:
             delete_model_cache(model.model._model_dir)
 
 
-def test_single_embedding_query():
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
+def test_single_embedding_query(model_name: str):
     is_ci = os.getenv("CI")
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
     queries_to_embed = docs
 
-    for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
+    for model_desc in LateInteractionTextEmbedding._list_supported_models():
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
+            continue
+        # Load the iterated model; model_name is only the selector given to should_test_model.
-        print("evaluating", model_name)
-        model = LateInteractionTextEmbedding(model_name=model_name)
+        print("evaluating", model_desc.model)
+        model = LateInteractionTextEmbedding(model_name=model_desc.model)
         result = next(iter(model.query_embed(queries_to_embed)))
+        expected_result = CANONICAL_QUERY_VALUES[model_desc.model]
         token_num, abridged_dim = expected_result.shape
         assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3)
 
         if is_ci:
             delete_model_cache(model.model._model_dir)
 
 
-def test_parallel_processing():
+@pytest.mark.parametrize("token_dim,model_name", [(96, "answerdotai/answerai-colbert-small-v1")])
+def test_parallel_processing(token_dim: int, model_name: str):
     is_ci = os.getenv("CI")
-    model = LateInteractionTextEmbedding(model_name="colbert-ir/colbertv2.0")
-    token_dim = 128
+    model = LateInteractionTextEmbedding(model_name=model_name)
+
     docs = ["hello world", "flag embedding"] * 100
     embeddings = list(model.embed(docs, batch_size=10, parallel=2))
     embeddings = np.stack(embeddings, axis=0)
@@ -222,10 +235,7 @@ def test_parallel_processing():
         delete_model_cache(model.model._model_dir)
 
 
-@pytest.mark.parametrize(
-    "model_name",
-    ["colbert-ir/colbertv2.0"],
-)
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
 def test_lazy_load(model_name: str):
     is_ci = os.getenv("CI")
 

diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py
@@ -1,5 +1,6 @@
 import os
 
+import pytest
 from PIL import Image
 import numpy as np
 
@@ -45,38 +46,38 @@
 
 
 def test_batch_embedding():
-    is_ci = os.getenv("CI")
+    if os.getenv("CI"):
+        pytest.skip("Colpali is too large to test in CI")
 
-    if not is_ci:
-        for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
-            print("evaluating", model_name)
-            model = LateInteractionMultimodalEmbedding(model_name=model_name)
-            result = list(model.embed_image(images, batch_size=2))
+    for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
+        print("evaluating", model_name)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = list(model.embed_image(images, batch_size=2))
 
-            for value in result:
-                token_num, abridged_dim = expected_result.shape
-                assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3)
+        for value in result:
+            token_num, abridged_dim = expected_result.shape
+            assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3)
 
 
 def test_single_embedding():
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
-            print("evaluating", model_name)
-            model = LateInteractionMultimodalEmbedding(model_name=model_name)
-            result = next(iter(model.embed_image(images, batch_size=6)))
-            token_num, abridged_dim = expected_result.shape
-            assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
+    if os.getenv("CI"):
+        pytest.skip("Colpali is too large to test in CI")
+
+    for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
+        print("evaluating", model_name)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = next(iter(model.embed_image(images, batch_size=6)))
+        token_num, abridged_dim = expected_result.shape
+        assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
 
 
 def test_single_embedding_query():
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        queries_to_embed = queries
-
-        for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
-            print("evaluating", model_name)
-            model = LateInteractionMultimodalEmbedding(model_name=model_name)
-            result = next(iter(model.embed_text(queries_to_embed)))
-            token_num, abridged_dim = expected_result.shape
-            assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
+    if os.getenv("CI"):
+        pytest.skip("Colpali is too large to test in CI")
+
+    for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
+        print("evaluating", model_name)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = next(iter(model.embed_text(queries)))
+        token_num, abridged_dim = expected_result.shape
+        assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
diff --git a/tests/test_sparse_embeddings.py b/tests/test_sparse_embeddings.py
@@ -5,10 +5,10 @@
 
 from fastembed.sparse.bm25 import Bm25
 from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
-from tests.utils import delete_model_cache
+from tests.utils import delete_model_cache, should_test_model
 
 CANONICAL_COLUMN_VALUES = {
-    "prithvida/Splade_PP_en_v1": {
+    "prithivida/Splade_PP_en_v1": {
         "indices": [
             2040,
             2047,
@@ -49,28 +49,41 @@
 docs = ["Hello World"]
 
 
-def test_batch_embedding() -> None:
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
+def test_batch_embedding(model_name: str) -> None:
     is_ci = os.getenv("CI")
     docs_to_embed = docs * 10
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
-        model = SparseTextEmbedding(model_name=model_name)
-        result = next(iter(model.embed(docs_to_embed, batch_size=6)))
-        assert result.indices.tolist() == expected_result["indices"]
+    model = SparseTextEmbedding(model_name=model_name)
+    result = next(iter(model.embed(docs_to_embed, batch_size=6)))
+    expected_result = CANONICAL_COLUMN_VALUES[model_name]
+    assert result.indices.tolist() == expected_result["indices"]
 
-        for i, value in enumerate(result.values):
-            assert pytest.approx(value, abs=0.001) == expected_result["values"][i]
-        if is_ci:
-            delete_model_cache(model.model._model_dir)
+    for i, value in enumerate(result.values):
+        assert pytest.approx(value, abs=0.001) == expected_result["values"][i]
+    if is_ci:
+        delete_model_cache(model.model._model_dir)
 
 
-def test_single_embedding() -> None:
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
+def test_single_embedding(model_name: str) -> None:
     is_ci = os.getenv("CI")
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
+
+    for model_desc in SparseTextEmbedding._list_supported_models():
+        if (
+            model_desc.model not in CANONICAL_COLUMN_VALUES
+        ):  # attention models and bm25 are also parts of
+            # SparseTextEmbedding, however, they have their own tests
+            continue
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
+            continue
+
         model = SparseTextEmbedding(model_name=model_name)
 
         passage_result = next(iter(model.embed(docs, batch_size=6)))
         query_result = next(iter(model.query_embed(docs)))
+        expected_result = CANONICAL_COLUMN_VALUES[model_name]
         for result in [passage_result, query_result]:
             assert result.indices.tolist() == expected_result["indices"]
 
@@ -80,9 +93,10 @@ def test_single_embedding() -> None:
             delete_model_cache(model.model._model_dir)
 
 
-def test_parallel_processing() -> None:
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
+def test_parallel_processing(model_name: str) -> None:
     is_ci = os.getenv("CI")
-    model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")
+    model = SparseTextEmbedding(model_name=model_name)
     docs = ["hello world", "flag embedding"] * 30
     sparse_embeddings_duo = list(model.embed(docs, batch_size=10, parallel=2))
     sparse_embeddings_all = list(model.embed(docs, batch_size=10, parallel=0))
@@ -172,10 +186,7 @@ def test_disable_stemmer_behavior(disable_stemmer: bool) -> None:
     assert result == expected, f"Expected {expected}, but got {result}"
 
 
-@pytest.mark.parametrize(
-    "model_name",
-    ["prithivida/Splade_PP_en_v1"],
-)
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
 def test_lazy_load(model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = SparseTextEmbedding(model_name=model_name, lazy_load=True)