diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 91aa267..de3814a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -86,3 +86,4 @@ jobs: CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }} CF_GATEWAY_ENDPOINT: ${{ secrets.CF_GATEWAY_ENDPOINT }} TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }} diff --git a/README.md b/README.md index 76736df..8b0c001 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@ pip install chromadbx - [Mistral AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#mistral-ai) embeddings - [Cloudflare Workers AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#cloudflare-workers-ai) embeddings - [SpaCy](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#spacy) embeddings - - [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings + - [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings. + - [Nomic](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#nomic) embeddings. ## Usage diff --git a/chromadbx/embeddings/mistral.py b/chromadbx/embeddings/mistral.py index f315474..6dd4f77 100644 --- a/chromadbx/embeddings/mistral.py +++ b/chromadbx/embeddings/mistral.py @@ -35,6 +35,21 @@ def __init__( ) def __call__(self, input: Documents) -> Embeddings: + """ + Get the embeddings for a list of texts. + + Args: + input (Documents): A list of texts to get embeddings for. + + Returns: + Embeddings: The embeddings for the texts. 
+ + Example: + >>> from chromadbx.embeddings.mistral import MistralAIEmbeddings + >>> ef = MistralAIEmbeddings() + >>> texts = ["Hello, world!", "How are you?"] + >>> embeddings = ef(texts) + """ embeddings_batch_response = self._client.embeddings.create( model=self._model, inputs=input, diff --git a/chromadbx/embeddings/nomic.py b/chromadbx/embeddings/nomic.py new file mode 100644 index 0000000..19ccb20 --- /dev/null +++ b/chromadbx/embeddings/nomic.py @@ -0,0 +1,112 @@ +from enum import Enum +import os +from typing import Optional, cast + +from chromadb.api.types import Documents, Embeddings, EmbeddingFunction + + +class TaskType(str, Enum): + SEARCH_DOCUMENT = "search_document" + SEARCH_QUERY = "search_query" + CLASSIFICATION = "classification" + CLUSTERING = "clustering" + + +class LongTextMode(str, Enum): + TRUNCATE = "truncate" + MEAN = "mean" + + +class NomicEmbeddingFunction(EmbeddingFunction[Documents]): # type: ignore[misc] + """ + Nomic Embedding Function using the Nomic Embedding API - https://docs.nomic.ai/atlas/models/text-embedding. + """ + + def __init__( + self, + api_key: Optional[str] = None, + model_name: Optional[str] = "nomic-embed-text-v1.5", + *, + dimensionality: Optional[int] = 768, + max_tokens_per_text: Optional[int] = 8192, + long_text_mode: Optional[LongTextMode] = LongTextMode.TRUNCATE, + task_type: Optional[TaskType] = TaskType.SEARCH_DOCUMENT, + ) -> None: + """ + Initialize the Nomic Embedding Function. + + Read more about the Nomic Embedding API here: https://docs.nomic.ai/reference/api/embed-text-v-1-embedding-text-post#request + + Args: + api_key (str): The API key to use for the Nomic Embedding API. + model_name (str): The name of the model to use for text embeddings. E.g. "nomic-embed-text-v1.5" (see https://docs.nomic.ai/atlas/models/text-embedding for available models). + dimensionality (int): The dimensionality of the embeddings. E.g. 768 for "nomic-embed-text-v1.5". 
+ max_tokens_per_text (int): The maximum number of tokens per text. E.g. 8192 for "nomic-embed-text-v1.5". + long_text_mode (str): The mode to use for long texts. E.g. "truncate" or "mean". + task_type (str): The task type to use for the Nomic Embedding API. E.g. "search_document", "search_query", "classification", and "clustering". + """ + try: + import httpx + except ImportError: + raise ValueError( + "The httpx python package is not installed. Please install it with `pip install httpx`" + ) + + if not api_key and os.getenv("NOMIC_API_KEY") is None: + raise ValueError( + "No Nomic API key provided or NOMIC_API_KEY environment variable is not set" + ) + if not api_key: + api_key = os.getenv("NOMIC_API_KEY") + + self._api_url = "https://api-atlas.nomic.ai/v1/embedding/text" + self._model_name = model_name + self._task_type = task_type + self._dimensionality = dimensionality + self._long_text_mode = long_text_mode + self._max_tokens_per_text = max_tokens_per_text + self._client = httpx.Client() + self._client.headers.update( + { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + } + ) + + def __call__(self, input: Documents) -> Embeddings: + """ + Get the embeddings for a list of texts. + + Args: + input (Documents): A list of texts to get embeddings for. + + Returns: + Embeddings: The embeddings for the texts. 
+ + Example: + >>> from chromadbx.embeddings.nomic import NomicEmbeddingFunction + >>> nomic_ef = NomicEmbeddingFunction(model_name="nomic-embed-text-v1.5") + >>> texts = ["Hello, world!", "How are you?"] + >>> embeddings = nomic_ef(texts) + """ + texts = input if isinstance(input, list) else [input] + + response = self._client.post( + self._api_url, + json={ + "model": self._model_name, + "texts": texts, + "task_type": self._task_type.value if self._task_type else None, + "dimensionality": self._dimensionality, + "long_text_mode": self._long_text_mode.value + if self._long_text_mode + else None, + "max_tokens_per_text": self._max_tokens_per_text, + }, + ) + response.raise_for_status() + response_json = response.json() + if "embeddings" not in response_json: + raise RuntimeError("Nomic API did not return embeddings") + + return cast(Embeddings, response_json["embeddings"]) diff --git a/chromadbx/embeddings/together.py b/chromadbx/embeddings/together.py index 85f21a1..68318eb 100644 --- a/chromadbx/embeddings/together.py +++ b/chromadbx/embeddings/together.py @@ -44,13 +44,10 @@ def __call__(self, input: Documents) -> Embeddings: Embeddings: The embeddings for the texts. 
Example: - ```python - import os - from chromadbx.embeddings.together import TogetherEmbeddingFunction - - ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY")) - embeddings = ef(["hello world", "goodbye world"]) - ``` + >>> import os + >>> from chromadbx.embeddings.together import TogetherEmbeddingFunction + >>> ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY")) + >>> embeddings = ef(["hello world", "goodbye world"]) """ outputs = self.client.embeddings.create(input=input, model=self.model_name) return cast(Embeddings, [outputs.data[i].embedding for i in range(len(input))]) diff --git a/docs/embeddings.md b/docs/embeddings.md index 34351e7..e3e8f15 100644 --- a/docs/embeddings.md +++ b/docs/embeddings.md @@ -253,3 +253,33 @@ col = client.get_or_create_collection("test", embedding_function=ef) col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"]) ``` + + +## Nomic + +A convenient way to generate embeddings using Nomic models. + +To use the embedding function, you need to install the `httpx` package. + +```bash +pip install httpx +``` + +Before you proceed, you will need to create an account and get an API key from [Nomic](https://atlas.nomic.ai). + +```py +import os +import chromadb +from chromadbx.embeddings.nomic import NomicEmbeddingFunction + +ef = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY")) + +client = chromadb.Client() + +col = client.get_or_create_collection("test", embedding_function=ef) + +col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"]) +``` + +> [!TIP] +> Nomic supports dimensionality reduction which can save storage space required in Chroma without degrading retrieval quality. To take advantage of this, use the `dimensionality` parameter in the `NomicEmbeddingFunction` class.
diff --git a/pyproject.toml b/pyproject.toml index f8090bd..6690d6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,13 +15,14 @@ packages = [{ include = "chromadbx" }] [tool.poetry.dependencies] python = ">=3.9,<3.13" pydantic = "^2.7.2" -chromadb = { version = ">=0.4.0,<=0.6.0", optional = true } +httpx = "^0.27.2" +chromadb = { version = ">=0.4.0,<0.7.0", optional = true } ulid-py = { version = "^1.1.0", optional = true } nanoid = { version = "^2.0.0", optional = true } llama-embedder = { version = "^0.0.7", optional = true } [tool.poetry.group.dev.dependencies] -chromadb = { version = ">=0.4.0,<=0.6.0" } +chromadb = { version = ">=0.4.0,<0.7.0" } pytest = "^8.2.1" black = "24.3.0" pre-commit = "^3.6.0" @@ -34,10 +35,9 @@ mistralai = "^1.1.0" spacy = "^3.8.4" together = "^1.3.11" - [tool.poetry.extras] ids = ["ulid-py", "nanoid"] -embeddings = ["llama-embedder", "onnxruntime"] +embeddings = ["llama-embedder", "onnxruntime", "huggingface_hub", "mistralai", "spacy", "together", "vertexai"] core = ["chromadb"] [build-system] diff --git a/test/embeddings/test_nomic.py b/test/embeddings/test_nomic.py new file mode 100644 index 0000000..ea29994 --- /dev/null +++ b/test/embeddings/test_nomic.py @@ -0,0 +1,41 @@ +import os +import pytest +from chromadbx.embeddings.nomic import NomicEmbeddingFunction + +httpx = pytest.importorskip("httpx", reason="httpx not installed") + + +@pytest.mark.skipif( + os.getenv("NOMIC_API_KEY") is None, + reason="NOMIC_API_KEY environment variable is not set", +) +def test_nomic() -> None: + ef = NomicEmbeddingFunction() + embeddings = ef(["hello world", "goodbye world"]) + assert len(embeddings) == 2 + assert len(embeddings[0]) == 768 + assert len(embeddings[1]) == 768 + + +@pytest.mark.skipif( + os.getenv("NOMIC_API_KEY") is None, + reason="NOMIC_API_KEY environment variable is not set", +) +def test_nomic_with_api_key() -> None: + ef = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY")) + embeddings = ef(["hello world", 
"goodbye world"]) + assert len(embeddings) == 2 + assert len(embeddings[0]) == 768 + assert len(embeddings[1]) == 768 + + +@pytest.mark.skipif( + os.getenv("NOMIC_API_KEY") is None, + reason="NOMIC_API_KEY environment variable is not set", +) +def test_dimensionality() -> None: + ef = NomicEmbeddingFunction(dimensionality=512) + embeddings = ef(["hello world", "goodbye world"]) + assert len(embeddings) == 2 + assert len(embeddings[0]) == 512 + assert len(embeddings[1]) == 512