Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,20 @@ jobs:
- ubuntu-latest
- macos-latest
- windows-latest
exclude:
# Exclude 3.10–3.12 for macOS and Windows
- os: macos-latest
python-version: '3.10.x'
- os: macos-latest
python-version: '3.11.x'
- os: macos-latest
python-version: '3.12.x'
- os: windows-latest
python-version: '3.10.x'
- os: windows-latest
python-version: '3.11.x'
- os: windows-latest
python-version: '3.12.x'

runs-on: ${{ matrix.os }}

Expand Down
12 changes: 10 additions & 2 deletions fastembed/sparse/bm42.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,17 @@
),
]

MODEL_TO_LANGUAGE = {

# Model names as published (mixed case) mapped to the language handed to the stemmer.
_MODEL_TO_LANGUAGE = {
    "Qdrant/bm42-all-minilm-l6-v2-attentions": "english",
}
# Same mapping keyed by lower-cased names, so lookups can ignore letter case.
MODEL_TO_LANGUAGE = {name.lower(): lang for name, lang in _MODEL_TO_LANGUAGE.items()}


def get_language_by_model_name(model_name: str) -> str:
    """Return the language registered for a model, ignoring letter case.

    Args:
        model_name: Model identifier, e.g.
            "Qdrant/bm42-all-minilm-l6-v2-attentions" in any casing.

    Returns:
        The language string stored in ``MODEL_TO_LANGUAGE``.

    Raises:
        KeyError: If the lower-cased name has no entry in ``MODEL_TO_LANGUAGE``.
    """
    # MODEL_TO_LANGUAGE is keyed by lower-cased names, so normalize before lookup.
    lookup_key = model_name.lower()
    return MODEL_TO_LANGUAGE[lookup_key]


class Bm42(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]):
Expand Down Expand Up @@ -124,7 +132,7 @@ def __init__(
self.special_tokens_ids: set[int] = set()
self.punctuation = set(string.punctuation)
self.stopwords = set(self._load_stopwords(self._model_dir))
self.stemmer = SnowballStemmer(MODEL_TO_LANGUAGE[model_name])
self.stemmer = SnowballStemmer(get_language_by_model_name(self.model_name))
self.alpha = alpha

if not self.lazy_load:
Expand Down
11 changes: 9 additions & 2 deletions fastembed/sparse/minicoil.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,16 @@
),
]

MODEL_TO_LANGUAGE = {
# Model names as published (mixed case) mapped to the language handed to the stemmer.
_MODEL_TO_LANGUAGE = {
    "Qdrant/minicoil-v1": "english",
}
# Same mapping keyed by lower-cased names, so lookups can ignore letter case.
MODEL_TO_LANGUAGE = {name.lower(): lang for name, lang in _MODEL_TO_LANGUAGE.items()}


def get_language_by_model_name(model_name: str) -> str:
    """Return the language registered for a model, ignoring letter case.

    Args:
        model_name: Model identifier, e.g. "Qdrant/minicoil-v1" in any casing.

    Returns:
        The language string stored in ``MODEL_TO_LANGUAGE``.

    Raises:
        KeyError: If the lower-cased name has no entry in ``MODEL_TO_LANGUAGE``.
    """
    # MODEL_TO_LANGUAGE is keyed by lower-cased names, so normalize before lookup.
    lookup_key = model_name.lower()
    return MODEL_TO_LANGUAGE[lookup_key]


class MiniCOIL(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]):
Expand Down Expand Up @@ -156,7 +163,7 @@ def load_onnx_model(self) -> None:
self.special_tokens_ids = set(self.special_token_to_id.values())
self.stopwords = set(self._load_stopwords(self._model_dir))

stemmer = SnowballStemmer(MODEL_TO_LANGUAGE[self.model_name])
stemmer = SnowballStemmer(get_language_by_model_name(self.model_name))

self.vocab_resolver = VocabResolver(
tokenizer=VocabTokenizer(self.tokenizer),
Expand Down
221 changes: 117 additions & 104 deletions tests/test_attention_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from contextlib import contextmanager

import numpy as np
import pytest
Expand All @@ -7,98 +8,119 @@
from tests.utils import delete_model_cache


@pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"])
def test_attention_embeddings(model_name: str) -> None:
is_ci = os.getenv("CI")
model = SparseTextEmbedding(model_name=model_name)

output = list(
model.query_embed(
[
"I must not fear. Fear is the mind-killer.",
]
)
)

assert len(output) == 1

for result in output:
assert len(result.indices) == len(result.values)
assert np.allclose(result.values, np.ones(len(result.values)))

quotes = [
"I must not fear. Fear is the mind-killer.",
"All animals are equal, but some animals are more equal than others.",
"It was a pleasure to burn.",
"The sky above the port was the color of television, tuned to a dead channel.",
"In the beginning, the universe was created."
" This has made a lot of people very angry and been widely regarded as a bad move.",
"It's a truth universally acknowledged that a zombie in possession of brains must be in want of more brains.",
"War is peace. Freedom is slavery. Ignorance is strength.",
"We're not in Infinity; we're in the suburbs.",
"I was a thousand times more evil than thou!",
"History is merely a list of surprises... It can only prepare us to be surprised yet again.",
".", # Empty string
]

output = list(model.embed(quotes))

assert len(output) == len(quotes)

for result in output[:-1]:
assert len(result.indices) == len(result.values)
assert len(result.indices) > 0

assert len(output[-1].indices) == 0

# Test support for unknown languages
output = list(
model.query_embed(
[
"привет мир!",
]
)
)
_MODELS_TO_CACHE = ("Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25")
MODELS_TO_CACHE = tuple([x.lower() for x in _MODELS_TO_CACHE])

assert len(output) == 1

for result in output:
assert len(result.indices) == len(result.values)
assert len(result.indices) == 2
@pytest.fixture(scope="module")
def model_cache():
is_ci = os.getenv("CI")
cache = {}

@contextmanager
def get_model(model_name: str):
lowercase_model_name = model_name.lower()
if lowercase_model_name not in cache:
cache[lowercase_model_name] = SparseTextEmbedding(lowercase_model_name)
yield cache[lowercase_model_name]
Comment on lines +22 to +25
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Do not lowercase the model ID before constructing SparseTextEmbedding

Lowercasing the Hugging Face ID makes the registry lookup fail—SparseTextEmbedding("qdrant/bm25") raises because the canonical key is "Qdrant/bm25". That’s why CI now reports failures in this module. Keep the cache keyed by lowercase if you like, but instantiate with the original model_name so the loader still finds the model.

Apply this diff:

-        if lowercase_model_name not in cache:
-            cache[lowercase_model_name] = SparseTextEmbedding(lowercase_model_name)
+        if lowercase_model_name not in cache:
+            cache[lowercase_model_name] = SparseTextEmbedding(model_name)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
lowercase_model_name = model_name.lower()
if lowercase_model_name not in cache:
cache[lowercase_model_name] = SparseTextEmbedding(lowercase_model_name)
yield cache[lowercase_model_name]
lowercase_model_name = model_name.lower()
if lowercase_model_name not in cache:
cache[lowercase_model_name] = SparseTextEmbedding(model_name)
yield cache[lowercase_model_name]
🧰 Tools
🪛 GitHub Actions: Tests

[error] Pytest run detected 2 test failures in this file.


[error] Pytest run detected additional test failures (overall fail state reported in test suite).

🤖 Prompt for AI Agents
In tests/test_attention_embeddings.py around lines 22 to 25, the code lowercases
model_name before constructing SparseTextEmbedding which breaks model registry
lookups (e.g., "Qdrant/bm25"); change it so the cache key remains lowercase but
pass the original model_name (not lowercased) when instantiating
SparseTextEmbedding, i.e., use cache[lowercase_model_name] =
SparseTextEmbedding(model_name) while keeping cache keyed by
lowercase_model_name.

if lowercase_model_name not in MODELS_TO_CACHE:
print("deleting model")
model_inst = cache.pop(lowercase_model_name)
if is_ci:
delete_model_cache(model_inst.model._model_dir)
del model_inst

yield get_model

if is_ci:
delete_model_cache(model.model._model_dir)
for name, model in cache.items():
delete_model_cache(model.model._model_dir)
cache.clear()


@pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"])
def test_parallel_processing(model_name: str) -> None:
is_ci = os.getenv("CI")
def test_attention_embeddings(model_cache, model_name: str) -> None:
with model_cache(model_name) as model:
output = list(
model.query_embed(
[
"I must not fear. Fear is the mind-killer.",
]
)
)

model = SparseTextEmbedding(model_name=model_name)
assert len(output) == 1

for result in output:
assert len(result.indices) == len(result.values)
assert np.allclose(result.values, np.ones(len(result.values)))

quotes = [
"I must not fear. Fear is the mind-killer.",
"All animals are equal, but some animals are more equal than others.",
"It was a pleasure to burn.",
"The sky above the port was the color of television, tuned to a dead channel.",
"In the beginning, the universe was created."
" This has made a lot of people very angry and been widely regarded as a bad move.",
Comment on lines +63 to +64
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Restore missing comma between adjacent literals

The two strings are now concatenated by Python because there’s no comma between them, so we silently lose a separate test input. Please add the comma back so both quotes are exercised independently.

-            "In the beginning, the universe was created."
-            " This has made a lot of people very angry and been widely regarded as a bad move.",
+            "In the beginning, the universe was created.",
+            " This has made a lot of people very angry and been widely regarded as a bad move.",
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"In the beginning, the universe was created."
" This has made a lot of people very angry and been widely regarded as a bad move.",
"In the beginning, the universe was created.",
" This has made a lot of people very angry and been widely regarded as a bad move.",
🤖 Prompt for AI Agents
In tests/test_attention_embeddings.py around lines 63 to 64 the two adjacent
string literals are missing a separating comma and are being implicitly
concatenated; restore the missing comma between the two quoted strings so they
become two separate list entries (i.e., insert a comma after the first string)
so both test inputs are preserved and exercised independently.

"It's a truth universally acknowledged that a zombie in possession of brains must be in want of more brains.",
"War is peace. Freedom is slavery. Ignorance is strength.",
"We're not in Infinity; we're in the suburbs.",
"I was a thousand times more evil than thou!",
"History is merely a list of surprises... It can only prepare us to be surprised yet again.",
".", # Empty string
]

output = list(model.embed(quotes))

assert len(output) == len(quotes)

for result in output[:-1]:
assert len(result.indices) == len(result.values)
assert len(result.indices) > 0

assert len(output[-1].indices) == 0

# Test support for unknown languages
output = list(
model.query_embed(
[
"привет мир!",
]
)
)

docs = ["hello world", "attention embedding", "Mangez-vous vraiment des grenouilles?"] * 100
embeddings = list(model.embed(docs, batch_size=10, parallel=2))
assert len(output) == 1

embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None))
for result in output:
assert len(result.indices) == len(result.values)
assert len(result.indices) == 2

embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0))

assert len(embeddings) == len(docs)
@pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"])
def test_parallel_processing(model_cache, model_name: str) -> None:
with model_cache(model_name) as model:
docs = [
"hello world",
"attention embedding",
"Mangez-vous vraiment des grenouilles?",
] * 100
embeddings = list(model.embed(docs, batch_size=10, parallel=2))

for emb_1, emb_2, emb_3 in zip(embeddings, embeddings_2, embeddings_3):
assert np.allclose(emb_1.indices, emb_2.indices)
assert np.allclose(emb_1.indices, emb_3.indices)
assert np.allclose(emb_1.values, emb_2.values)
assert np.allclose(emb_1.values, emb_3.values)
embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None))

if is_ci:
delete_model_cache(model.model._model_dir)
embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0))

assert len(embeddings) == len(docs)

for emb_1, emb_2, emb_3 in zip(embeddings, embeddings_2, embeddings_3):
assert np.allclose(emb_1.indices, emb_2.indices)
assert np.allclose(emb_1.indices, emb_3.indices)
assert np.allclose(emb_1.values, emb_2.values)
assert np.allclose(emb_1.values, emb_3.values)

@pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
def test_multilanguage(model_name: str) -> None:
is_ci = os.getenv("CI")

@pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
def test_multilanguage(model_cache, model_name: str) -> None:
docs = ["Mangez-vous vraiment des grenouilles?", "Je suis au lit"]

model = SparseTextEmbedding(model_name=model_name, language="french")
Expand All @@ -109,39 +131,30 @@ def test_multilanguage(model_name: str) -> None:
assert embeddings[1].values.shape == (1,)
assert embeddings[1].indices.shape == (1,)

model = SparseTextEmbedding(model_name=model_name, language="english")
embeddings = list(model.embed(docs))[:2]
assert embeddings[0].values.shape == (5,)
assert embeddings[0].indices.shape == (5,)
with model_cache(model_name) as model: # language = "english"
embeddings = list(model.embed(docs))[:2]
assert embeddings[0].values.shape == (5,)
assert embeddings[0].indices.shape == (5,)

assert embeddings[1].values.shape == (4,)
assert embeddings[1].indices.shape == (4,)

if is_ci:
delete_model_cache(model.model._model_dir)
assert embeddings[1].values.shape == (4,)
assert embeddings[1].indices.shape == (4,)


@pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
def test_special_characters(model_name: str) -> None:
is_ci = os.getenv("CI")

docs = [
"Über den größten Flüssen Österreichs äußern sich Experten häufig: Öko-Systeme müssen geschützt werden!",
"L'élève français s'écrie : « Où est mon crayon ? J'ai besoin de finir cet exercice avant la récréation!",
"Într-o zi însorită, Ștefan și Ioana au mâncat mămăligă cu brânză și au băut țuică la cabană.",
"Üzgün öğretmen öğrencilere seslendi: Lütfen gürültü yapmayın, sınavınızı bitirmeye çalışıyorum!",
"Ο Ξενοφών είπε: «Ψάχνω για ένα ωραίο δώρο για τη γιαγιά μου. Ίσως ένα φυτό ή ένα βιβλίο;»",
"Hola! ¿Cómo estás? Estoy muy emocionado por el cumpleaños de mi hermano, ¡va a ser increíble! También quiero comprar un pastel de chocolate con fresas y un regalo especial: un libro titulado «Cien años de soledad",
]

model = SparseTextEmbedding(model_name=model_name, language="english")
embeddings = list(model.embed(docs))
for idx, shape in enumerate([14, 18, 15, 10, 15]):
assert embeddings[idx].values.shape == (shape,)
assert embeddings[idx].indices.shape == (shape,)

if is_ci:
delete_model_cache(model.model._model_dir)
def test_special_characters(model_cache, model_name: str) -> None:
    """Check embedding sizes for texts containing diacritics and non-Latin script.

    Obtains the model through the ``model_cache`` fixture, embeds the documents
    below and asserts that ``values`` and ``indices`` of each checked result
    have the expected one-dimensional shape.
    """
    # Expected vector length per document, in document order.
    # NOTE(review): five sizes are listed for six documents, so the last text
    # is embedded but its size is never asserted -- confirm this is intended.
    expected_sizes = [14, 18, 15, 10, 15]

    with model_cache(model_name) as model:
        texts = [
            "Über den größten Flüssen Österreichs äußern sich Experten häufig: Öko-Systeme müssen geschützt werden!",
            "L'élève français s'écrie : « Où est mon crayon ? J'ai besoin de finir cet exercice avant la récréation!",
            "Într-o zi însorită, Ștefan și Ioana au mâncat mămăligă cu brânză și au băut țuică la cabană.",
            "Üzgün öğretmen öğrencilere seslendi: Lütfen gürültü yapmayın, sınavınızı bitirmeye çalışıyorum!",
            "Ο Ξενοφών είπε: «Ψάχνω για ένα ωραίο δώρο για τη γιαγιά μου. Ίσως ένα φυτό ή ένα βιβλίο;»",
            "Hola! ¿Cómo estás? Estoy muy emocionado por el cumpleaños de mi hermano, ¡va a ser increíble! También quiero comprar un pastel de chocolate con fresas y un regalo especial: un libro titulado «Cien años de soledad",
        ]
        embeddings = list(model.embed(texts))
        for position, size in enumerate(expected_sizes):
            vector = embeddings[position]
            assert vector.values.shape == (size,)
            assert vector.indices.shape == (size,)


@pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions"])
Expand Down
14 changes: 14 additions & 0 deletions tests/test_custom_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,13 @@ def test_text_custom_model():
assert embeddings.shape == (2, dim)

assert np.allclose(embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3)

if is_ci:
delete_model_cache(model.model._model_dir)

CustomTextEmbedding.SUPPORTED_MODELS.clear()
CustomTextEmbedding.POSTPROCESSING_MAPPING.clear()


def test_cross_encoder_custom_model():
is_ci = os.getenv("CI")
Expand Down Expand Up @@ -110,6 +114,8 @@ def test_cross_encoder_custom_model():
if is_ci:
delete_model_cache(model.model._model_dir)

CustomTextCrossEncoder.SUPPORTED_MODELS.clear()


def test_mock_add_custom_models():
dim = 5
Expand Down Expand Up @@ -169,6 +175,9 @@ def test_mock_add_custom_models():
)
assert np.allclose(post_processed_output, expected_output[model_name], atol=1e-3)

CustomTextEmbedding.SUPPORTED_MODELS.clear()
CustomTextEmbedding.POSTPROCESSING_MAPPING.clear()


def test_do_not_add_existing_model():
existing_base_model = "sentence-transformers/all-MiniLM-L6-v2"
Expand Down Expand Up @@ -203,6 +212,9 @@ def test_do_not_add_existing_model():
size_in_gb=0.47,
)

CustomTextEmbedding.SUPPORTED_MODELS.clear()
CustomTextEmbedding.POSTPROCESSING_MAPPING.clear()


def test_do_not_add_existing_cross_encoder():
existing_base_model = "Xenova/ms-marco-MiniLM-L-6-v2"
Expand All @@ -227,3 +239,5 @@ def test_do_not_add_existing_cross_encoder():
sources=ModelSource(hf=custom_model_name),
size_in_gb=0.08,
)

CustomTextCrossEncoder.SUPPORTED_MODELS.clear()
Loading