Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,4 @@ jobs:
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
CF_GATEWAY_ENDPOINT: ${{ secrets.CF_GATEWAY_ENDPOINT }}
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }}
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ pip install chromadbx
- [Mistral AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#mistral-ai) embeddings
- [Cloudflare Workers AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#cloudflare-workers-ai) embeddings
- [SpaCy](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#spacy) embeddings
- [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings
- [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings
- [Nomic](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#nomic) embeddings

## Usage

Expand Down
15 changes: 15 additions & 0 deletions chromadbx/embeddings/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ def __init__(
)

def __call__(self, input: Documents) -> Embeddings:
"""
Get the embeddings for a list of texts.

Args:
input (Documents): A list of texts to get embeddings for.

Returns:
Embeddings: The embeddings for the texts.

Example:
>>> from chromadbx.embeddings.mistral import MistralAIEmbeddings
>>> ef = MistralAIEmbeddings()
>>> texts = ["Hello, world!", "How are you?"]
>>> embeddings = ef(texts)
"""
embeddings_batch_response = self._client.embeddings.create(
model=self._model,
inputs=input,
Expand Down
112 changes: 112 additions & 0 deletions chromadbx/embeddings/nomic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from enum import Enum
import os
from typing import Optional, cast

from chromadb.api.types import Documents, Embeddings, EmbeddingFunction


class TaskType(str, Enum):
    """Task types accepted by the Nomic API's ``task_type`` request parameter."""

    SEARCH_DOCUMENT = "search_document"
    SEARCH_QUERY = "search_query"
    CLASSIFICATION = "classification"
    CLUSTERING = "clustering"


class LongTextMode(str, Enum):
    """Values for the API's ``long_text_mode`` parameter — how texts longer than
    ``max_tokens_per_text`` are handled (see the Nomic API docs for semantics)."""

    TRUNCATE = "truncate"
    MEAN = "mean"


class NomicEmbeddingFunction(EmbeddingFunction[Documents]):  # type: ignore[misc]
    """
    Nomic Embedding Function using the Nomic Embedding API - https://docs.nomic.ai/atlas/models/text-embedding.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: Optional[str] = "nomic-embed-text-v1.5",
        *,
        dimensionality: Optional[int] = 768,
        max_tokens_per_text: Optional[int] = 8192,
        long_text_mode: Optional[LongTextMode] = LongTextMode.TRUNCATE,
        task_type: Optional[TaskType] = TaskType.SEARCH_DOCUMENT,
    ) -> None:
        """
        Initialize the Nomic Embedding Function.

        Read more about the Nomic Embedding API here: https://docs.nomic.ai/reference/api/embed-text-v-1-embedding-text-post#request

        Args:
            api_key (str): The API key to use for the Nomic Embedding API. If not
                provided, the ``NOMIC_API_KEY`` environment variable is used.
            model_name (str): The name of the model to use for text embeddings. E.g. "nomic-embed-text-v1.5" (see https://docs.nomic.ai/atlas/models/text-embedding for available models).
            dimensionality (int): The dimensionality of the embeddings. E.g. 768 for "nomic-embed-text-v1.5".
            max_tokens_per_text (int): The maximum number of tokens per text. E.g. 8192 for "nomic-embed-text-v1.5".
            long_text_mode (str): The mode to use for long texts. E.g. "truncate" or "mean".
            task_type (str): The task type to use for the Nomic Embedding API. E.g. "search_document", "search_query", "classification", and "clustering".

        Raises:
            ValueError: If the ``httpx`` package is not installed, or if no API
                key is provided and ``NOMIC_API_KEY`` is not set (or is empty).
        """
        try:
            import httpx
        except ImportError as e:
            # Raise ValueError (not ImportError) for consistency with the other
            # embedding functions in this package, but chain the original error
            # so the real cause is visible in the traceback.
            raise ValueError(
                "The httpx python package is not installed. Please install it with `pip install httpx`"
            ) from e

        # Explicit argument takes precedence; fall back to the environment.
        # A single resolution step also rejects an empty-string NOMIC_API_KEY,
        # which would otherwise produce a useless "Bearer " header.
        api_key = api_key or os.getenv("NOMIC_API_KEY")
        if not api_key:
            raise ValueError(
                "No Nomic API key provided or NOMIC_API_KEY environment variable is not set"
            )

        self._api_url = "https://api-atlas.nomic.ai/v1/embedding/text"
        self._model_name = model_name
        self._task_type = task_type
        self._dimensionality = dimensionality
        self._long_text_mode = long_text_mode
        self._max_tokens_per_text = max_tokens_per_text
        # One reusable HTTP client with auth and content-type configured once.
        self._client = httpx.Client()
        self._client.headers.update(
            {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            }
        )

    def __call__(self, input: Documents) -> Embeddings:
        """
        Get the embeddings for a list of texts.

        Args:
            input (Documents): A list of texts to get embeddings for.

        Returns:
            Embeddings: The embeddings for the texts.

        Raises:
            httpx.HTTPStatusError: If the API responds with a non-2xx status.
            RuntimeError: If the API response contains no "embeddings" key.

        Example:
            >>> from chromadbx.embeddings.nomic import NomicEmbeddingFunction
            >>> nomic_ef = NomicEmbeddingFunction(model_name="nomic-embed-text-v1.5")
            >>> texts = ["Hello, world!", "How are you?"]
            >>> embeddings = nomic_ef(texts)
        """
        # Defensive: accept a bare string even though the declared type is a list.
        texts = input if isinstance(input, list) else [input]

        response = self._client.post(
            self._api_url,
            json={
                "model": self._model_name,
                "texts": texts,
                # Enum members serialize to their string values; None is passed
                # through so the API applies its own defaults.
                "task_type": self._task_type.value if self._task_type else None,
                "dimensionality": self._dimensionality,
                "long_text_mode": self._long_text_mode.value
                if self._long_text_mode
                else None,
                "max_tokens_per_text": self._max_tokens_per_text,
            },
        )
        response.raise_for_status()
        response_json = response.json()
        if "embeddings" not in response_json:
            raise RuntimeError("Nomic API did not return embeddings")

        return cast(Embeddings, response_json["embeddings"])
11 changes: 4 additions & 7 deletions chromadbx/embeddings/together.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,10 @@ def __call__(self, input: Documents) -> Embeddings:
Embeddings: The embeddings for the texts.

Example:
```python
import os
from chromadbx.embeddings.together import TogetherEmbeddingFunction

ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY"))
embeddings = ef(["hello world", "goodbye world"])
```
>>> import os
>>> from chromadbx.embeddings.together import TogetherEmbeddingFunction
>>> ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY"))
>>> embeddings = ef(["hello world", "goodbye world"])
"""
outputs = self.client.embeddings.create(input=input, model=self.model_name)
return cast(Embeddings, [outputs.data[i].embedding for i in range(len(input))])
30 changes: 30 additions & 0 deletions docs/embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,33 @@ col = client.get_or_create_collection("test", embedding_function=ef)

col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"])
```


## Nomic

A convenient way to generate embeddings using Nomic models.

To use the embedding function, you need to install the `httpx` package (the embedding function calls the Nomic HTTP API directly and does not require the `nomic` SDK).

```bash
pip install httpx
```

Before you proceed, you will need to create an account and get an API key from [Nomic](https://atlas.nomic.ai).

```py
import os
import chromadb
from chromadbx.embeddings.nomic import NomicEmbeddingFunction

ef = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY"))

client = chromadb.Client()

col = client.get_or_create_collection("test", embedding_function=ef)

col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"])
```

> [!TIP]
> Nomic supports dimensionality reduction which can save storage space required in Chroma without degrading retrieval quality. To take advantage of this use `dimensionality` parameter in the `NomicEmbeddingFunction` class.
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@ packages = [{ include = "chromadbx" }]
[tool.poetry.dependencies]
python = ">=3.9,<3.13"
pydantic = "^2.7.2"
chromadb = { version = ">=0.4.0,<=0.6.0", optional = true }
httpx = "^0.27.2"
chromadb = { version = ">=0.4.0,<0.7.0", optional = true }
ulid-py = { version = "^1.1.0", optional = true }
nanoid = { version = "^2.0.0", optional = true }
llama-embedder = { version = "^0.0.7", optional = true }

[tool.poetry.group.dev.dependencies]
chromadb = { version = ">=0.4.0,<=0.6.0" }
chromadb = { version = ">=0.4.0,<0.7.0" }
pytest = "^8.2.1"
black = "24.3.0"
pre-commit = "^3.6.0"
Expand All @@ -34,10 +35,9 @@ mistralai = "^1.1.0"
spacy = "^3.8.4"
together = "^1.3.11"


[tool.poetry.extras]
ids = ["ulid-py", "nanoid"]
embeddings = ["llama-embedder", "onnxruntime"]
embeddings = ["llama-embedder", "onnxruntime", "huggingface_hub", "mistralai", "spacy", "together", "vertexai"]
core = ["chromadb"]

[build-system]
Expand Down
41 changes: 41 additions & 0 deletions test/embeddings/test_nomic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
import pytest
from chromadbx.embeddings.nomic import NomicEmbeddingFunction

# The Nomic embedding function talks to the API via httpx, so skip the whole
# module when it is missing; the skip reason must name httpx, not nomic.
httpx = pytest.importorskip("httpx", reason="httpx not installed")


@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_nomic() -> None:
    """Default construction (env-var key) returns one 768-dim vector per text."""
    documents = ["hello world", "goodbye world"]
    result = NomicEmbeddingFunction()(documents)
    assert len(result) == len(documents)
    assert all(len(vector) == 768 for vector in result)


@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_nomic_with_api_key() -> None:
    """An explicitly supplied API key behaves the same as the environment variable."""
    ef = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY"))
    vectors = ef(["hello world", "goodbye world"])
    assert len(vectors) == 2
    for vector in vectors:
        assert len(vector) == 768


@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_dimensionality() -> None:
    """The dimensionality parameter controls the length of the returned vectors."""
    reduced = 512
    ef = NomicEmbeddingFunction(dimensionality=reduced)
    vectors = ef(["hello world", "goodbye world"])
    assert len(vectors) == 2
    assert {len(vector) for vector in vectors} == {reduced}
Loading