
Commit a7575cc

[ENH] Add chroma_bm25 embedding function to python (#5806)
## Description of changes

_Summarize the changes made by this PR._

- Improvements & Bug fixes
  - Add the `chroma_bm25` embedding function to the Python client. It matches the Rust BM25 implementation and is 100% compatible with the JS client's `chroma_bm25` embedding function.
- New functionality
  - ...

## Test plan

_How are these changes tested?_

- Added new tests to ensure the results from this embedding function match those of the Rust and JS implementations.
- Manually created a collection in staging with this client, then called `getCollection` from JS to confirm the two clients are compatible in both directions; documents embed correctly as expected.
- [x] Tests pass locally with `pytest` for Python, `yarn test` for JS, `cargo test` for Rust

## Migration plan

_Are there any migrations, or any forwards/backwards compatibility changes needed in order to make sure this change deploys reliably?_

## Observability plan

_What is the plan to instrument and monitor this change?_

## Documentation Changes

_Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs section](https://github.com/chroma-core/chroma/tree/main/docs/docs.trychroma.com)?_
1 parent 1064fac commit a7575cc
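
For orientation (this is not part of the commit), a minimal usage sketch of the new embedding function, based on how the tests below call it; the sample strings are illustrative only:

```python
from chromadb.utils.embedding_functions import ChromaBm25EmbeddingFunction

# Defaults mirror the Rust/JS implementations: k=1.2, b=0.75,
# avg_doc_length=256.0, token_max_length=40.
embedder = ChromaBm25EmbeddingFunction()

# Documents become sparse vectors: sorted hashed-token indices plus BM25 weights.
[doc_vector] = embedder(["BM25 is great for sparse retrieval tasks"])
print(doc_vector.indices, doc_vector.values)

# Queries go through embed_query, which applies the same encoding.
[query_vector] = embedder.embed_query(["retrieve BM25 docs"])
```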

File tree: 7 files changed, +574 -0 lines changed

Lines changed: 139 additions & 0 deletions
import math

import pytest

from chromadb.utils.embedding_functions.chroma_bm25_embedding_function import (
    DEFAULT_CHROMA_BM25_STOPWORDS,
    ChromaBm25EmbeddingFunction,
)


def _is_sorted(values: list[int]) -> bool:
    return all(values[i] >= values[i - 1] for i in range(1, len(values)))


def test_comprehensive_tokenization_matches_reference() -> None:
    embedder = ChromaBm25EmbeddingFunction()
    embedding = embedder(
        [
            "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",
        ]
    )[0]

    expected_indices = [
        230246813,
        395514983,
        458027949,
        488165615,
        729632045,
        734978415,
        997512866,
        1114505193,
        1381820790,
        1501587190,
        1649421877,
        1837285388,
    ]
    expected_value = 1.6391153

    assert embedding.indices == expected_indices
    for value in embedding.values:
        assert value == pytest.approx(expected_value, abs=1e-5)


def test_matches_rust_reference_values() -> None:
    embedder = ChromaBm25EmbeddingFunction()
    embedding = embedder(
        [
            "The space-time continuum WARPS near massive objects...",
        ]
    )[0]

    expected_indices = [
        90097469,
        519064992,
        737893654,
        1110755108,
        1950894484,
        2031641008,
        2058513491,
    ]
    expected_value = 1.660867

    assert embedding.indices == expected_indices
    for value in embedding.values:
        assert value == pytest.approx(expected_value, abs=1e-5)


def test_generates_embeddings_for_multiple_documents() -> None:
    embedder = ChromaBm25EmbeddingFunction()
    texts = [
        "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",
        "The space-time continuum WARPS near massive objects...",
        "BM25 is great for sparse retrieval tasks",
    ]

    embeddings = embedder(texts)

    assert len(embeddings) == len(texts)
    for embedding in embeddings:
        assert embedding.indices
        assert len(embedding.indices) == len(embedding.values)
        assert _is_sorted(embedding.indices)
        for value in embedding.values:
            assert value > 0
            assert math.isfinite(value)


def test_embed_query_matches_call() -> None:
    embedder = ChromaBm25EmbeddingFunction()
    query = "retrieve BM25 docs"

    query_embedding = embedder.embed_query([query])[0]
    doc_embedding = embedder([query])[0]

    assert query_embedding.indices == doc_embedding.indices
    assert query_embedding.values == doc_embedding.values


def test_config_round_trip() -> None:
    embedder = ChromaBm25EmbeddingFunction()
    config = embedder.get_config()

    assert config["k"] == pytest.approx(1.2, abs=1e-9)
    assert config["b"] == pytest.approx(0.75, abs=1e-9)
    assert config["avg_doc_length"] == pytest.approx(256.0, abs=1e-9)
    assert config["token_max_length"] == 40
    assert "stopwords" not in config

    custom_stopwords = DEFAULT_CHROMA_BM25_STOPWORDS[:10]
    rebuilt = ChromaBm25EmbeddingFunction.build_from_config(
        {
            **config,
            "stopwords": custom_stopwords,
        }
    )

    rebuilt_config = rebuilt.get_config()
    assert rebuilt_config["stopwords"] == custom_stopwords
    assert rebuilt_config["token_max_length"] == config["token_max_length"]
    assert rebuilt_config["k"] == pytest.approx(config["k"], abs=1e-9)
    assert rebuilt_config["b"] == pytest.approx(config["b"], abs=1e-9)
    assert rebuilt_config["avg_doc_length"] == pytest.approx(
        config["avg_doc_length"], abs=1e-9
    )


def test_validate_config_update_rejects_unknown_keys() -> None:
    embedder = ChromaBm25EmbeddingFunction()

    with pytest.raises(ValueError):
        embedder.validate_config_update(embedder.get_config(), {"unknown": 123})


def test_validate_config_update_allows_known_keys() -> None:
    embedder = ChromaBm25EmbeddingFunction()

    embedder.validate_config_update(
        embedder.get_config(), {"k": 1.1, "stopwords": ["custom"]}
    )

chromadb/test/ef/test_ef.py

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ def test_get_builtins_holds() -> None:
         "Bm25EmbeddingFunction",
         "ChromaCloudQwenEmbeddingFunction",
         "ChromaCloudSpladeEmbeddingFunction",
+        "ChromaBm25EmbeddingFunction",
     }

     assert expected_builtins == embedding_functions.get_builtins()

chromadb/utils/embedding_functions/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -83,6 +83,9 @@
 from chromadb.utils.embedding_functions.chroma_cloud_splade_embedding_function import (
     ChromaCloudSpladeEmbeddingFunction,
 )
+from chromadb.utils.embedding_functions.chroma_bm25_embedding_function import (
+    ChromaBm25EmbeddingFunction,
+)


 # Get all the class names for backward compatibility
@@ -116,6 +119,7 @@
     "Bm25EmbeddingFunction",
     "ChromaCloudQwenEmbeddingFunction",
     "ChromaCloudSpladeEmbeddingFunction",
+    "ChromaBm25EmbeddingFunction",
 }


@@ -157,6 +161,7 @@ def get_builtins() -> Set[str]:
     "fastembed_sparse": FastembedSparseEmbeddingFunction,
     "bm25": Bm25EmbeddingFunction,
     "chroma-cloud-splade": ChromaCloudSpladeEmbeddingFunction,
+    "chroma_bm25": ChromaBm25EmbeddingFunction,
 }


@@ -273,6 +278,7 @@ def config_to_embedding_function(config: Dict[str, Any]) -> EmbeddingFunction:
     "Bm25EmbeddingFunction",
     "ChromaCloudQwenEmbeddingFunction",
     "ChromaCloudSpladeEmbeddingFunction",
+    "ChromaBm25EmbeddingFunction",
     "register_embedding_function",
     "config_to_embedding_function",
     "known_embedding_functions",

chromadb/utils/embedding_functions/bm25_embedding_function.py

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,7 @@
 )
 from typing import Dict, Any, TypedDict, Optional
 from typing import cast, Literal
+import warnings
 from chromadb.utils.embedding_functions.schemas import validate_config_schema
 from chromadb.utils.sparse_embedding_utils import normalize_sparse_vector

@@ -45,6 +46,11 @@ def __init__(
             query_config (dict, optional): Configuration for the query, can be "task"
             **kwargs: Additional arguments to pass to the Bm25 model.
         """
+        warnings.warn(
+            "Bm25EmbeddingFunction is deprecated. Please use ChromaBm25EmbeddingFunction instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         try:
             from fastembed.sparse.bm25 import Bm25
         except ImportError:
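
With this change, constructing the legacy fastembed-based `Bm25EmbeddingFunction` now emits a `DeprecationWarning`. A rough illustration of how that could be asserted (not part of this PR, and it assumes `fastembed` is installed so the constructor can get past its import):

```python
import pytest

from chromadb.utils.embedding_functions import Bm25EmbeddingFunction


def test_bm25_embedding_function_warns_deprecation() -> None:
    # The warning added in this commit points users at ChromaBm25EmbeddingFunction.
    with pytest.warns(DeprecationWarning, match="ChromaBm25EmbeddingFunction"):
        Bm25EmbeddingFunction()
```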
Lines changed: 135 additions & 0 deletions
from __future__ import annotations

from collections import Counter
from typing import Any, Dict, Iterable, List, Optional, TypedDict

from chromadb.api.types import Documents, SparseEmbeddingFunction, SparseVectors
from chromadb.base_types import SparseVector
from chromadb.utils.embedding_functions.schemas import validate_config_schema
from chromadb.utils.embedding_functions.schemas.bm25_tokenizer import (
    Bm25Tokenizer,
    DEFAULT_CHROMA_BM25_STOPWORDS as _DEFAULT_STOPWORDS,
    get_english_stemmer,
    Murmur3AbsHasher,
)

NAME = "chroma_bm25"

DEFAULT_K = 1.2
DEFAULT_B = 0.75
DEFAULT_AVG_DOC_LENGTH = 256.0
DEFAULT_TOKEN_MAX_LENGTH = 40

DEFAULT_CHROMA_BM25_STOPWORDS: List[str] = list(_DEFAULT_STOPWORDS)


class ChromaBm25Config(TypedDict, total=False):
    k: float
    b: float
    avg_doc_length: float
    token_max_length: int
    stopwords: List[str]


class ChromaBm25EmbeddingFunction(SparseEmbeddingFunction[Documents]):
    def __init__(
        self,
        k: float = DEFAULT_K,
        b: float = DEFAULT_B,
        avg_doc_length: float = DEFAULT_AVG_DOC_LENGTH,
        token_max_length: int = DEFAULT_TOKEN_MAX_LENGTH,
        stopwords: Optional[Iterable[str]] = None,
    ) -> None:
        """Initialize the BM25 sparse embedding function."""

        self.k = float(k)
        self.b = float(b)
        self.avg_doc_length = float(avg_doc_length)
        self.token_max_length = int(token_max_length)

        if stopwords is not None:
            self.stopwords: Optional[List[str]] = [str(word) for word in stopwords]
            stopword_list: Iterable[str] = self.stopwords
        else:
            self.stopwords = None
            stopword_list = DEFAULT_CHROMA_BM25_STOPWORDS

        stemmer = get_english_stemmer()
        self._tokenizer = Bm25Tokenizer(stemmer, stopword_list, self.token_max_length)
        self._hasher = Murmur3AbsHasher()

    def _encode(self, text: str) -> SparseVector:
        tokens = self._tokenizer.tokenize(text)

        if not tokens:
            return SparseVector(indices=[], values=[])

        doc_len = float(len(tokens))
        counts = Counter(self._hasher.hash(token) for token in tokens)

        indices = sorted(counts.keys())
        values: List[float] = []
        for idx in indices:
            tf = float(counts[idx])
            denominator = tf + self.k * (
                1 - self.b + (self.b * doc_len) / self.avg_doc_length
            )
            score = tf * (self.k + 1) / denominator
            values.append(score)

        return SparseVector(indices=indices, values=values)

    def __call__(self, input: Documents) -> SparseVectors:
        sparse_vectors: SparseVectors = []

        if not input:
            return sparse_vectors

        for document in input:
            sparse_vectors.append(self._encode(document))

        return sparse_vectors

    def embed_query(self, input: Documents) -> SparseVectors:
        return self.__call__(input)

    @staticmethod
    def name() -> str:
        return NAME

    @staticmethod
    def build_from_config(
        config: Dict[str, Any]
    ) -> "SparseEmbeddingFunction[Documents]":
        return ChromaBm25EmbeddingFunction(
            k=config.get("k", DEFAULT_K),
            b=config.get("b", DEFAULT_B),
            avg_doc_length=config.get("avg_doc_length", DEFAULT_AVG_DOC_LENGTH),
            token_max_length=config.get("token_max_length", DEFAULT_TOKEN_MAX_LENGTH),
            stopwords=config.get("stopwords"),
        )

    def get_config(self) -> Dict[str, Any]:
        config: Dict[str, Any] = {
            "k": self.k,
            "b": self.b,
            "avg_doc_length": self.avg_doc_length,
            "token_max_length": self.token_max_length,
        }

        if self.stopwords is not None:
            config["stopwords"] = list(self.stopwords)

        return config

    def validate_config_update(
        self, old_config: Dict[str, Any], new_config: Dict[str, Any]
    ) -> None:
        mutable_keys = {"k", "b", "avg_doc_length", "token_max_length", "stopwords"}
        for key in new_config:
            if key not in mutable_keys:
                raise ValueError(f"Updating '{key}' is not supported for {NAME}")

    @staticmethod
    def validate_config(config: Dict[str, Any]) -> None:
        validate_config_schema(config, NAME)
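
For reference, each weight produced by `_encode` above is the IDF-free BM25 term-frequency saturation term, `tf * (k + 1) / (tf + k * (1 - b + b * doc_len / avg_doc_length))`, so it depends only on the token's count and the document length. A quick arithmetic check (my own, assuming the first test sentence tokenizes into 12 tokens, which is consistent with its 12 expected indices and identical values) reproduces the reference value used in the tests:

```python
# BM25 term weight with the defaults, tf = 1, and a 12-token document.
k, b, avg_doc_length = 1.2, 0.75, 256.0
tf, doc_len = 1.0, 12.0

denominator = tf + k * (1 - b + (b * doc_len) / avg_doc_length)
score = tf * (k + 1) / denominator
print(score)  # ~1.63911525, matching expected_value = 1.6391153 within the test's 1e-5 tolerance
```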
