Skip to content

Commit 137d2fd

Browse files
authored
feat: Nomic embedding function (#74)
Closes #73. Original work credited to @andrewblum on the Chroma core project.
1 parent ba1493d commit 137d2fd

File tree

8 files changed

+209
-12
lines changed

8 files changed

+209
-12
lines changed

.github/workflows/test.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,4 @@ jobs:
8686
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
8787
CF_GATEWAY_ENDPOINT: ${{ secrets.CF_GATEWAY_ENDPOINT }}
8888
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
89+
NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }}

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ pip install chromadbx
1717
- [Mistral AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#mistral-ai) embeddings
1818
- [Cloudflare Workers AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#cloudflare-workers-ai) embeddings
1919
- [SpaCy](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#spacy) embeddings
20-
- [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings
20+
- [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings.
21+
- [Nomic](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#nomic) embeddings.
2122

2223
## Usage
2324

chromadbx/embeddings/mistral.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ def __init__(
3535
)
3636

3737
def __call__(self, input: Documents) -> Embeddings:
38+
"""
39+
Get the embeddings for a list of texts.
40+
41+
Args:
42+
input (Documents): A list of texts to get embeddings for.
43+
44+
Returns:
45+
Embeddings: The embeddings for the texts.
46+
47+
Example:
48+
>>> from chromadbx.embeddings.mistral import MistralAIEmbeddings
49+
>>> ef = MistralAIEmbeddings()
50+
>>> texts = ["Hello, world!", "How are you?"]
51+
>>> embeddings = ef(texts)
52+
"""
3853
embeddings_batch_response = self._client.embeddings.create(
3954
model=self._model,
4055
inputs=input,

chromadbx/embeddings/nomic.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
from enum import Enum
2+
import os
3+
from typing import Optional, cast
4+
5+
from chromadb.api.types import Documents, Embeddings, EmbeddingFunction
6+
7+
8+
class TaskType(str, Enum):
    """Task types accepted by the Nomic text-embedding endpoint.

    The value is sent verbatim as the ``task_type`` field of the API request;
    subclassing ``str`` lets members be used directly where a string is expected.
    """

    SEARCH_DOCUMENT = "search_document"
    SEARCH_QUERY = "search_query"
    CLASSIFICATION = "classification"
    CLUSTERING = "clustering"
13+
14+
15+
class LongTextMode(str, Enum):
    """How the Nomic API should handle texts longer than the token limit.

    Sent verbatim as the ``long_text_mode`` field of the API request.
    """

    TRUNCATE = "truncate"
    MEAN = "mean"
18+
19+
20+
class NomicEmbeddingFunction(EmbeddingFunction[Documents]):  # type: ignore[misc]
    """
    Embedding function backed by the Nomic text-embedding REST API -
    https://docs.nomic.ai/atlas/models/text-embedding.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: Optional[str] = "nomic-embed-text-v1.5",
        *,
        dimensionality: Optional[int] = 768,
        max_tokens_per_text: Optional[int] = 8192,
        long_text_mode: Optional[LongTextMode] = LongTextMode.TRUNCATE,
        task_type: Optional[TaskType] = TaskType.SEARCH_DOCUMENT,
    ) -> None:
        """
        Initialize the Nomic Embedding Function.

        Read more about the Nomic Embedding API here: https://docs.nomic.ai/reference/api/embed-text-v-1-embedding-text-post#request

        Args:
            api_key (str): The API key to use for the Nomic Embedding API.
                Falls back to the ``NOMIC_API_KEY`` environment variable when not given.
            model_name (str): The name of the model to use for text embeddings. E.g. "nomic-embed-text-v1.5" (see https://docs.nomic.ai/atlas/models/text-embedding for available models).
            dimensionality (int): The dimensionality of the embeddings. E.g. 768 for "nomic-embed-text-v1.5".
            max_tokens_per_text (int): The maximum number of tokens per text. E.g. 8192 for "nomic-embed-text-v1.5".
            long_text_mode (str): The mode to use for long texts. E.g. "truncate" or "mean".
            task_type (str): The task type to use for the Nomic Embedding API. E.g. "search_document", "search_query", "classification", and "clustering".

        Raises:
            ValueError: If ``httpx`` is not installed, or if no API key can be resolved.
        """
        # httpx is an optional dependency; import lazily so the module itself
        # can be imported without it.
        try:
            import httpx
        except ImportError:
            raise ValueError(
                "The httpx python package is not installed. Please install it with `pip install httpx`"
            )

        # Explicit argument wins; otherwise fall back to the environment.
        resolved_key = api_key or os.getenv("NOMIC_API_KEY")
        if resolved_key is None:
            raise ValueError(
                "No Nomic API key provided or NOMIC_API_KEY environment variable is not set"
            )

        self._api_url = "https://api-atlas.nomic.ai/v1/embedding/text"
        self._model_name = model_name
        self._task_type = task_type
        self._dimensionality = dimensionality
        self._long_text_mode = long_text_mode
        self._max_tokens_per_text = max_tokens_per_text
        # A single reusable client carrying auth and content-type headers.
        self._client = httpx.Client()
        self._client.headers.update(
            {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {resolved_key}",
            }
        )

    def __call__(self, input: Documents) -> Embeddings:
        """
        Get the embeddings for a list of texts.

        Args:
            input (Documents): A list of texts to get embeddings for.

        Returns:
            Embeddings: The embeddings for the texts.

        Raises:
            httpx.HTTPStatusError: If the API responds with a non-2xx status.
            RuntimeError: If the API response lacks an ``embeddings`` field.

        Example:
            >>> from chromadbx.embeddings.nomic import NomicEmbeddingFunction
            >>> nomic_ef = NomicEmbeddingFunction(model_name="nomic-embed-text-v1.5")
            >>> texts = ["Hello, world!", "How are you?"]
            >>> embeddings = nomic_ef(texts)
        """
        # Tolerate a single document by wrapping it in a list.
        payload_texts = input if isinstance(input, list) else [input]

        request_body = {
            "model": self._model_name,
            "texts": payload_texts,
            # Enum members serialize via .value; optional fields pass through as None.
            "task_type": self._task_type.value if self._task_type else None,
            "dimensionality": self._dimensionality,
            "long_text_mode": self._long_text_mode.value if self._long_text_mode else None,
            "max_tokens_per_text": self._max_tokens_per_text,
        }
        response = self._client.post(self._api_url, json=request_body)
        response.raise_for_status()

        body = response.json()
        if "embeddings" not in body:
            raise RuntimeError("Nomic API did not return embeddings")

        return cast(Embeddings, body["embeddings"])

chromadbx/embeddings/together.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,10 @@ def __call__(self, input: Documents) -> Embeddings:
4444
Embeddings: The embeddings for the texts.
4545
4646
Example:
47-
```python
48-
import os
49-
from chromadbx.embeddings.together import TogetherEmbeddingFunction
50-
51-
ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY"))
52-
embeddings = ef(["hello world", "goodbye world"])
53-
```
47+
>>> import os
48+
>>> from chromadbx.embeddings.together import TogetherEmbeddingFunction
49+
>>> ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY"))
50+
>>> embeddings = ef(["hello world", "goodbye world"])
5451
"""
5552
outputs = self.client.embeddings.create(input=input, model=self.model_name)
5653
return cast(Embeddings, [outputs.data[i].embedding for i in range(len(input))])

docs/embeddings.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,3 +253,33 @@ col = client.get_or_create_collection("test", embedding_function=ef)
253253

254254
col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"])
255255
```
256+
257+
258+
## Nomic
259+
260+
A convenient way to generate embeddings using Nomic models.
261+
262+
To use the embedding function, you need to install the `httpx` package.
263+
264+
```bash
265+
pip install httpx
266+
```
267+
268+
Before you proceed, you will need to create an account and get an API key from [Nomic](https://atlas.nomic.ai).
269+
270+
```py
271+
import os
272+
import chromadb
273+
from chromadbx.embeddings.nomic import NomicEmbeddingFunction
274+
275+
ef = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY"))
276+
277+
client = chromadb.Client()
278+
279+
col = client.get_or_create_collection("test", embedding_function=ef)
280+
281+
col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"])
282+
```
283+
284+
> [!TIP]
285+
> Nomic supports dimensionality reduction which can save storage space required in Chroma without degrading retrieval quality. To take advantage of this use `dimensionality` parameter in the `NomicEmbeddingFunction` class.

pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,14 @@ packages = [{ include = "chromadbx" }]
1515
[tool.poetry.dependencies]
1616
python = ">=3.9,<3.13"
1717
pydantic = "^2.7.2"
18-
chromadb = { version = ">=0.4.0,<=0.6.0", optional = true }
18+
httpx = "^0.27.2"
19+
chromadb = { version = ">=0.4.0,<0.7.0", optional = true }
1920
ulid-py = { version = "^1.1.0", optional = true }
2021
nanoid = { version = "^2.0.0", optional = true }
2122
llama-embedder = { version = "^0.0.7", optional = true }
2223

2324
[tool.poetry.group.dev.dependencies]
24-
chromadb = { version = ">=0.4.0,<=0.6.0" }
25+
chromadb = { version = ">=0.4.0,<0.7.0" }
2526
pytest = "^8.2.1"
2627
black = "24.3.0"
2728
pre-commit = "^3.6.0"
@@ -34,10 +35,9 @@ mistralai = "^1.1.0"
3435
spacy = "^3.8.4"
3536
together = "^1.3.11"
3637

37-
3838
[tool.poetry.extras]
3939
ids = ["ulid-py", "nanoid"]
40-
embeddings = ["llama-embedder", "onnxruntime"]
40+
embeddings = ["llama-embedder", "onnxruntime", "huggingface_hub", "mistralai", "spacy", "together", "vertexai"]
4141
core = ["chromadb"]
4242

4343
[build-system]

test/embeddings/test_nomic.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import os
2+
import pytest
3+
from chromadbx.embeddings.nomic import NomicEmbeddingFunction
4+
5+
httpx = pytest.importorskip("httpx", reason="nomic not installed")
6+
7+
8+
@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_nomic() -> None:
    # Default construction resolves the key from the environment.
    fn = NomicEmbeddingFunction()
    vectors = fn(["hello world", "goodbye world"])
    assert len(vectors) == 2
    # Default dimensionality for nomic-embed-text-v1.5 is 768.
    assert len(vectors[0]) == 768
    assert len(vectors[1]) == 768
18+
19+
20+
@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_nomic_with_api_key() -> None:
    # Passing the key explicitly must behave the same as the env fallback.
    fn = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY"))
    vectors = fn(["hello world", "goodbye world"])
    assert len(vectors) == 2
    assert len(vectors[0]) == 768
    assert len(vectors[1]) == 768
30+
31+
32+
@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_dimensionality() -> None:
    # The API honors a reduced output dimensionality.
    fn = NomicEmbeddingFunction(dimensionality=512)
    vectors = fn(["hello world", "goodbye world"])
    assert len(vectors) == 2
    assert len(vectors[0]) == 512
    assert len(vectors[1]) == 512

0 commit comments

Comments
 (0)