Skip to content

Commit ff87da3

Browse files
committed
add chroma
1 parent 6bab477 commit ff87da3

File tree

13 files changed

+354
-2
lines changed

13 files changed

+354
-2
lines changed

benchmark/convert.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def main():
6060
data = json.load(open(search_file))
6161
experiment_name = data["params"]["experiment"]
6262
dataset_name = data["params"]["dataset"]
63-
engine_params = data["params"]["config"]
63+
engine_params = data["params"].get("config", {})
6464
parallel = data["params"]["parallel"]
6565
engine_name = data["params"]["engine"]
6666

engine/clients/chroma/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from engine.clients.chroma.configure import ChromaConfigurator
2+
from engine.clients.chroma.search import ChromaSearcher
3+
from engine.clients.chroma.upload import ChromaUploader
4+
5+
__all__ = [
6+
"ChromaConfigurator",
7+
"ChromaSearcher",
8+
"ChromaUploader",
9+
]

engine/clients/chroma/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import os
2+
3+
# Name of the Chroma collection used by the configurator, uploader and searcher.
# Read from the CHROMA_COLLECTION_NAME environment variable; defaults to "benchmark".
CHROMA_COLLECTION_NAME = os.getenv("CHROMA_COLLECTION_NAME", "benchmark")
4+
5+
6+
def chroma_fix_host(host: str):
    """Return ``"127.0.0.1"`` when *host* is ``"localhost"``; otherwise return *host* unchanged."""
    if host == "localhost":
        return "127.0.0.1"
    return host

engine/clients/chroma/configure.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from chromadb import HttpClient, Settings
2+
3+
from benchmark.dataset import Dataset
4+
from engine.base_client.configure import BaseConfigurator
5+
from engine.base_client.distances import Distance
6+
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host
7+
8+
9+
class ChromaConfigurator(BaseConfigurator):
    """Create and drop the benchmark collection on a Chroma server reached over HTTP."""

    # Benchmark distance -> value stored under the collection's "hnsw:space" metadata key.
    DISTANCE_MAPPING = {
        Distance.L2: "l2",
        Distance.COSINE: "cosine",
        Distance.DOT: "ip",
    }

    def __init__(self, host, collection_params: dict, connection_params: dict):
        """
        Build the HTTP client used by :meth:`clean` and :meth:`recreate`.

        Args:
            host: Server host; ``"localhost"`` is rewritten by ``chroma_fix_host``.
            collection_params: Passed to the base configurator; later expanded
                into ``create_collection`` by :meth:`recreate`.
            connection_params: Extra keyword arguments forwarded to ``HttpClient``.
        """
        super().__init__(host, collection_params, connection_params)
        self.client = HttpClient(
            host=chroma_fix_host(host),
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
            **connection_params,
        )

    def clean(self):
        """
        Delete a collection and all associated embeddings, documents, and metadata.

        This is destructive and not reversible.
        """
        try:
            self.client.delete_collection(name=CHROMA_COLLECTION_NAME)
        except Exception:
            # Deliberate best effort: a failed delete (presumably because the
            # collection does not exist yet) must not abort the benchmark run.
            pass

    def recreate(self, dataset: Dataset, collection_params):
        """
        Create the benchmark collection with the distance required by *dataset*.

        The ``collection_params`` argument is not read; ``self.collection_params``
        is used instead. User-supplied ``metadata`` entries take precedence over
        the derived ``"hnsw:space"`` value.
        """
        # Work on a copy: writing "metadata" into self.collection_params would
        # leak the first dataset's "hnsw:space" into every later recreate() call.
        params = dict(self.collection_params)
        params["metadata"] = {
            "hnsw:space": self.DISTANCE_MAPPING.get(dataset.config.distance),
            **params.get("metadata", {}),
        }
        self.client.create_collection(
            name=CHROMA_COLLECTION_NAME,
            **params,
        )

engine/clients/chroma/parser.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from typing import List, Optional
2+
3+
from chromadb import Where
4+
from chromadb.types import OperatorExpression
5+
6+
from engine.base_client import IncompatibilityError
7+
from engine.base_client.parser import BaseConditionParser, FieldValue
8+
9+
10+
class ChromaConditionParser(BaseConditionParser):
    """Translate benchmark filter conditions into Chroma ``where`` dictionaries."""

    def build_condition(
        self, and_subfilters: Optional[List[Where]], or_subfilters: Optional[List[Where]]
    ) -> Where:
        """
        Combine sub-filters into a single ``where`` expression.

        Every entry of *and_subfilters* must match, and at least one entry of
        *or_subfilters* must match. The result always has at most one top-level
        key, because Chroma rejects ``where`` dictionaries with several
        top-level operators.

        Returns:
            ``{}`` when there is nothing to filter on, the lone sub-filter when
            there is exactly one clause, otherwise ``{"$and": [...]}``.
        """
        clauses: List[Where] = list(and_subfilters or [])

        if or_subfilters:
            if len(or_subfilters) == 1:
                # "$or" needs at least two operands; a single alternative is
                # just a plain required clause.
                clauses.append(or_subfilters[0])
            else:
                clauses.append({"$or": or_subfilters})

        if not clauses:
            return {}
        if len(clauses) == 1:
            return clauses[0]
        return {"$and": clauses}

    def build_exact_match_filter(self, field_name: str, value: FieldValue) -> Where:
        """Return a filter matching records whose *field_name* equals *value*."""
        return {field_name: value}

    def build_range_filter(
        self,
        field_name: str,
        lt: Optional[FieldValue],
        gt: Optional[FieldValue],
        lte: Optional[FieldValue],
        gte: Optional[FieldValue],
    ) -> Where:
        """
        Return a filter restricting *field_name* to the given bounds.

        Bounds that are ``None`` are ignored. Each remaining bound becomes its
        own single-operator expression; several bounds are joined with
        ``"$and"`` because Chroma accepts only one operator per expression.
        """
        raw_filters: OperatorExpression = {
            "$lt": lt,
            "$gt": gt,
            "$lte": lte,
            "$gte": gte,
        }
        clauses: List[Where] = [
            {field_name: {operator: bound}}
            for operator, bound in raw_filters.items()
            if bound is not None
        ]

        if len(clauses) == 1:
            return clauses[0]
        if clauses:
            return {"$and": clauses}
        # No bounds given: keep the original (empty) expression shape.
        return {field_name: {}}

    def build_geo_filter(
        self, field_name: str, lat: float, lon: float, radius: float
    ) -> Where:
        """Geo filters are not built for Chroma; always raises ``IncompatibilityError``."""
        raise IncompatibilityError

engine/clients/chroma/search.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from typing import List, Tuple
2+
3+
from chromadb import HttpClient, ClientAPI, Settings
4+
from chromadb.api.types import IncludeEnum
5+
6+
from dataset_reader.base_reader import Query
7+
from engine.base_client.search import BaseSearcher
8+
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host
9+
from engine.clients.chroma.parser import ChromaConditionParser
10+
11+
12+
class ChromaSearcher(BaseSearcher):
    """Run nearest-neighbour queries against the benchmark collection on a Chroma server."""

    # Shared HTTP client; populated by init_client().
    client: ClientAPI = None
    # Converts the benchmark's meta conditions into Chroma "where" filters.
    parser = ChromaConditionParser()

    @classmethod
    def init_client(cls, host, distance, connection_params: dict, search_params: dict):
        """
        Connect to the server, fetch the benchmark collection and remember *search_params*.

        ``connection_params`` are forwarded as keyword arguments to ``HttpClient``;
        ``distance`` is not used here.
        """
        cls.client = HttpClient(
            host=chroma_fix_host(host),
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
            **connection_params,
        )
        cls.collection = cls.client.get_collection(name=CHROMA_COLLECTION_NAME)
        cls.search_params = search_params

    @classmethod
    def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]:
        """Return up to *top* ``(id, distance)`` pairs for *query*, honouring its meta conditions."""
        res = cls.collection.query(
            query_embeddings=[query.vector],
            n_results=top,
            where=cls.parser.parse(query.meta_conditions),
            include=[IncludeEnum.distances],
        )

        # A single embedding was sent, so only the first result list is relevant.
        return [
            (int(hit_id), float(hit_distance))
            for hit_id, hit_distance in zip(res["ids"][0], res["distances"][0])
        ]

    def setup_search(self):
        """
        Apply the ``metadata`` entry of the search parameters to the collection.

        The collection's current metadata is used as the base, minus
        ``"hnsw:space"``, and the search parameters' ``metadata`` is merged on
        top. Nothing is sent when the merged metadata is empty.
        """
        # The collection may carry no metadata at all, hence the "or {}".
        metadata = dict(self.collection.metadata or {})
        metadata.pop("hnsw:space", None)  # Not allowed in the collection.modify method
        metadata.update(self.search_params.get("metadata", {}))
        if metadata:
            # An empty metadata dict is expected to be rejected by Chroma, and
            # there would be nothing to change anyway.
            self.collection.modify(metadata=metadata)

engine/clients/chroma/upload.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from typing import List
2+
3+
from chromadb import HttpClient, ClientAPI, Settings
4+
5+
from dataset_reader.base_reader import Record
6+
from engine.base_client.upload import BaseUploader
7+
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host
8+
9+
10+
class ChromaUploader(BaseUploader):
    """Upload record batches into the benchmark collection on a Chroma server."""

    # Shared HTTP client; populated by init_client().
    client: ClientAPI = None
    upload_params = {}

    @classmethod
    def init_client(cls, host, distance, connection_params, upload_params):
        """
        Connect to the server and fetch the benchmark collection.

        ``connection_params`` are forwarded as keyword arguments to ``HttpClient``;
        ``distance`` and ``upload_params`` are not used here.
        """
        http_client = HttpClient(
            host=chroma_fix_host(host),
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
            **connection_params,
        )
        cls.client = http_client
        cls.collection = http_client.get_collection(name=CHROMA_COLLECTION_NAME)

    @classmethod
    def upload_batch(cls, batch: List[Record]):
        """Add every record of *batch* to the collection in one ``add`` call."""
        # One pass over the batch: ids are stringified and falsy metadata
        # (e.g. an empty dict) is sent as None.
        rows = [(str(record.id), record.vector, record.metadata or None) for record in batch]
        ids = [row[0] for row in rows]
        vectors = [row[1] for row in rows]
        payloads = [row[2] for row in rows]

        cls.collection.add(
            ids=ids,
            embeddings=vectors,
            metadatas=payloads or None,
        )

engine/clients/client_factory.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
BaseSearcher,
88
BaseUploader,
99
)
10+
from engine.clients.chroma import ChromaConfigurator, ChromaSearcher, ChromaUploader
1011
from engine.clients.elasticsearch import (
1112
ElasticConfigurator,
1213
ElasticSearcher,
@@ -39,6 +40,7 @@
3940
"opensearch": OpenSearchConfigurator,
4041
"redis": RedisConfigurator,
4142
"pgvector": PgVectorConfigurator,
43+
"chroma": ChromaConfigurator,
4244
}
4345

4446
ENGINE_UPLOADERS = {
@@ -49,6 +51,7 @@
4951
"opensearch": OpenSearchUploader,
5052
"redis": RedisUploader,
5153
"pgvector": PgVectorUploader,
54+
"chroma": ChromaUploader,
5255
}
5356

5457
ENGINE_SEARCHERS = {
@@ -59,6 +62,7 @@
5962
"opensearch": OpenSearchSearcher,
6063
"redis": RedisSearcher,
6164
"pgvector": PgVectorSearcher,
65+
"chroma": ChromaSearcher,
6266
}
6367

6468

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
services:
2+
chromadb_bench:
3+
image: ${CONTAINER_REGISTRY:-docker.io}/chromadb/chroma:0.5.7
4+
#volumes:
5+
# - ./chromadb:/chroma/chroma
6+
ports:
7+
- "8000:8000"
8+
logging:
9+
driver: "json-file"
10+
options:
11+
max-file: 1
12+
max-size: 10m
13+
environment:
14+
IS_PERSISTENT: TRUE
15+
ANONYMIZED_TELEMETRY: False
16+
CHROMA_WORKERS: 2
17+
deploy:
18+
resources:
19+
limits:
20+
memory: 25Gb
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
[
2+
{
3+
"name": "chroma-default",
4+
"engine": "chroma",
5+
"connection_params": {},
6+
"collection_params": {},
7+
"search_params": [
8+
{
9+
"parallel": 8
10+
}
11+
],
12+
"upload_params": {
13+
"parallel": 16,
14+
"batch_size": 1024
15+
}
16+
},
17+
{
18+
"name": "chroma-m-16-ef-128",
19+
"engine": "chroma",
20+
"connection_params": {},
21+
"collection_params": {
22+
"metadata": {
23+
"hnsw:M": 16,
24+
"hnsw:construction_ef": 128
25+
}
26+
},
27+
"search_params": [
28+
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
29+
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
30+
],
31+
"upload_params": { "parallel": 16 }
32+
},
33+
{
34+
"name": "chroma-m-32-ef-128",
35+
"engine": "chroma",
36+
"connection_params": {},
37+
"collection_params": {
38+
"metadata": {
39+
"hnsw:M": 32,
40+
"hnsw:construction_ef": 128
41+
}
42+
},
43+
"search_params": [
44+
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
45+
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
46+
],
47+
"upload_params": { "parallel": 16 }
48+
},
49+
{
50+
"name": "chroma-m-32-ef-256",
51+
"engine": "chroma",
52+
"connection_params": {},
53+
"collection_params": {
54+
"metadata": {
55+
"hnsw:M": 32,
56+
"hnsw:construction_ef": 256
57+
}
58+
},
59+
"search_params": [
60+
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
61+
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
62+
],
63+
"upload_params": { "parallel": 16 }
64+
},
65+
{
66+
"name": "chroma-m-32-ef-512",
67+
"engine": "chroma",
68+
"connection_params": {},
69+
"collection_params": {
70+
"metadata": {
71+
"hnsw:M": 32,
72+
"hnsw:construction_ef": 512
73+
}
74+
},
75+
"search_params": [
76+
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
77+
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
78+
],
79+
"upload_params": { "parallel": 16 }
80+
},
81+
{
82+
"name": "chroma-m-64-ef-256",
83+
"engine": "chroma",
84+
"connection_params": {},
85+
"collection_params": {
86+
"metadata": {
87+
"hnsw:M": 64,
88+
"hnsw:construction_ef": 256
89+
}
90+
},
91+
"search_params": [
92+
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
93+
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
94+
],
95+
"upload_params": { "parallel": 16 }
96+
},
97+
{
98+
"name": "chroma-m-64-ef-512",
99+
"engine": "chroma",
100+
"connection_params": {},
101+
"collection_params": {
102+
"metadata": {
103+
"hnsw:M": 64,
104+
"hnsw:construction_ef": 512
105+
}
106+
},
107+
"search_params": [
108+
{ "parallel": 1, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 1, "metadata": {"hnsw:search_ef": 512 } },
109+
{ "parallel": 100, "metadata": {"hnsw:search_ef": 128 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 256 } }, { "parallel": 100, "metadata": {"hnsw:search_ef": 512 } }
110+
],
111+
"upload_params": { "parallel": 16 }
112+
}
113+
]

0 commit comments

Comments
 (0)