Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,4 @@ jobs:
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
CF_GATEWAY_ENDPOINT: ${{ secrets.CF_GATEWAY_ENDPOINT }}
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }}
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ pip install chromadbx
- [Mistral AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#mistral-ai) embeddings
- [Cloudflare Workers AI](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#cloudflare-workers-ai) embeddings
- [SpaCy](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#spacy) embeddings
- [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings
- [Together](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#together) embeddings
- [Nomic](https://github.com/amikos-tech/chromadbx/blob/main/docs/embeddings.md#nomic) embeddings

## Usage

Expand Down
15 changes: 15 additions & 0 deletions chromadbx/embeddings/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ def __init__(
)

def __call__(self, input: Documents) -> Embeddings:
"""
Get the embeddings for a list of texts.

Args:
input (Documents): A list of texts to get embeddings for.

Returns:
Embeddings: The embeddings for the texts.

Example:
>>> from chromadbx.embeddings.mistral import MistralAIEmbeddings
>>> ef = MistralAIEmbeddings()
>>> texts = ["Hello, world!", "How are you?"]
>>> embeddings = ef(texts)
"""
embeddings_batch_response = self._client.embeddings.create(
model=self._model,
inputs=input,
Expand Down
112 changes: 112 additions & 0 deletions chromadbx/embeddings/nomic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from enum import Enum
import os
from typing import Optional, cast

from chromadb.api.types import Documents, Embeddings, EmbeddingFunction


class TaskType(str, Enum):
    """Task types accepted by the Nomic API's ``task_type`` request parameter."""

    SEARCH_DOCUMENT = "search_document"
    SEARCH_QUERY = "search_query"
    CLASSIFICATION = "classification"
    CLUSTERING = "clustering"


class LongTextMode(str, Enum):
    """Values for the API's ``long_text_mode`` parameter — how texts longer than
    ``max_tokens_per_text`` are handled (see the Nomic API docs for semantics)."""

    TRUNCATE = "truncate"
    MEAN = "mean"


class NomicEmbeddingFunction(EmbeddingFunction[Documents]):  # type: ignore[misc]
    """
    Nomic Embedding Function using the Nomic Embedding API - https://docs.nomic.ai/atlas/models/text-embedding.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: Optional[str] = "nomic-embed-text-v1.5",
        *,
        dimensionality: Optional[int] = 768,
        max_tokens_per_text: Optional[int] = 8192,
        long_text_mode: Optional[LongTextMode] = LongTextMode.TRUNCATE,
        task_type: Optional[TaskType] = TaskType.SEARCH_DOCUMENT,
    ) -> None:
        """
        Initialize the Nomic Embedding Function.

        Read more about the Nomic Embedding API here: https://docs.nomic.ai/reference/api/embed-text-v-1-embedding-text-post#request

        Args:
            api_key (str): The API key to use for the Nomic Embedding API. If not
                provided, the ``NOMIC_API_KEY`` environment variable is used.
            model_name (str): The name of the model to use for text embeddings. E.g. "nomic-embed-text-v1.5" (see https://docs.nomic.ai/atlas/models/text-embedding for available models).
            dimensionality (int): The dimensionality of the embeddings. E.g. 768 for "nomic-embed-text-v1.5".
            max_tokens_per_text (int): The maximum number of tokens per text. E.g. 8192 for "nomic-embed-text-v1.5".
            long_text_mode (str): The mode to use for long texts. E.g. "truncate" or "mean".
            task_type (str): The task type to use for the Nomic Embedding API. E.g. "search_document", "search_query", "classification", and "clustering".

        Raises:
            ValueError: If the ``httpx`` package is not installed, or if no API
                key is provided and ``NOMIC_API_KEY`` is not set (or is empty).
        """
        try:
            import httpx
        except ImportError as e:
            # Raise ValueError (not ImportError) for consistency with the other
            # embedding functions in this package, but chain the original error
            # so the real cause is visible in the traceback.
            raise ValueError(
                "The httpx python package is not installed. Please install it with `pip install httpx`"
            ) from e

        # Explicit argument takes precedence; fall back to the environment.
        # A single resolution step also rejects an empty-string NOMIC_API_KEY,
        # which would otherwise produce a useless "Bearer " header.
        api_key = api_key or os.getenv("NOMIC_API_KEY")
        if not api_key:
            raise ValueError(
                "No Nomic API key provided or NOMIC_API_KEY environment variable is not set"
            )

        self._api_url = "https://api-atlas.nomic.ai/v1/embedding/text"
        self._model_name = model_name
        self._task_type = task_type
        self._dimensionality = dimensionality
        self._long_text_mode = long_text_mode
        self._max_tokens_per_text = max_tokens_per_text
        # One reusable HTTP client with auth and content-type configured once.
        self._client = httpx.Client()
        self._client.headers.update(
            {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            }
        )

    def __call__(self, input: Documents) -> Embeddings:
        """
        Get the embeddings for a list of texts.

        Args:
            input (Documents): A list of texts to get embeddings for.

        Returns:
            Embeddings: The embeddings for the texts.

        Raises:
            httpx.HTTPStatusError: If the API responds with a non-2xx status.
            RuntimeError: If the API response contains no "embeddings" key.

        Example:
            >>> from chromadbx.embeddings.nomic import NomicEmbeddingFunction
            >>> nomic_ef = NomicEmbeddingFunction(model_name="nomic-embed-text-v1.5")
            >>> texts = ["Hello, world!", "How are you?"]
            >>> embeddings = nomic_ef(texts)
        """
        # Defensive: accept a bare string even though the declared type is a list.
        texts = input if isinstance(input, list) else [input]

        response = self._client.post(
            self._api_url,
            json={
                "model": self._model_name,
                "texts": texts,
                # Enum members serialize to their string values; None is passed
                # through so the API applies its own defaults.
                "task_type": self._task_type.value if self._task_type else None,
                "dimensionality": self._dimensionality,
                "long_text_mode": self._long_text_mode.value
                if self._long_text_mode
                else None,
                "max_tokens_per_text": self._max_tokens_per_text,
            },
        )
        response.raise_for_status()
        response_json = response.json()
        if "embeddings" not in response_json:
            raise RuntimeError("Nomic API did not return embeddings")

        return cast(Embeddings, response_json["embeddings"])
11 changes: 4 additions & 7 deletions chromadbx/embeddings/together.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,10 @@ def __call__(self, input: Documents) -> Embeddings:
Embeddings: The embeddings for the texts.

Example:
```python
import os
from chromadbx.embeddings.together import TogetherEmbeddingFunction

ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY"))
embeddings = ef(["hello world", "goodbye world"])
```
>>> import os
>>> from chromadbx.embeddings.together import TogetherEmbeddingFunction
>>> ef = TogetherEmbeddingFunction(api_key=os.getenv("TOGETHER_API_KEY"))
>>> embeddings = ef(["hello world", "goodbye world"])
"""
outputs = self.client.embeddings.create(input=input, model=self.model_name)
return cast(Embeddings, [outputs.data[i].embedding for i in range(len(input))])
30 changes: 30 additions & 0 deletions docs/embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,33 @@ col = client.get_or_create_collection("test", embedding_function=ef)

col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"])
```


## Nomic

A convenient way to generate embeddings using Nomic models.

To use the embedding function, you need to install the `httpx` package (the embedding function calls the Nomic HTTP API directly and does not require the `nomic` SDK).

```bash
pip install httpx
```

Before you proceed, you will need to create an account and get an API key from [Nomic](https://atlas.nomic.ai).

```py
import os
import chromadb
from chromadbx.embeddings.nomic import NomicEmbeddingFunction

ef = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY"))

client = chromadb.Client()

col = client.get_or_create_collection("test", embedding_function=ef)

col.add(ids=["id1", "id2", "id3"], documents=["lorem ipsum...", "doc2", "doc3"])
```

> [!TIP]
> Nomic supports dimensionality reduction which can save storage space required in Chroma without degrading retrieval quality. To take advantage of this use `dimensionality` parameter in the `NomicEmbeddingFunction` class.
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@ packages = [{ include = "chromadbx" }]
[tool.poetry.dependencies]
python = ">=3.9,<3.13"
pydantic = "^2.7.2"
chromadb = { version = ">=0.4.0,<=0.6.0", optional = true }
httpx = "^0.27.2"
chromadb = { version = ">=0.4.0,<0.7.0", optional = true }
ulid-py = { version = "^1.1.0", optional = true }
nanoid = { version = "^2.0.0", optional = true }
llama-embedder = { version = "^0.0.7", optional = true }

[tool.poetry.group.dev.dependencies]
chromadb = { version = ">=0.4.0,<=0.6.0" }
chromadb = { version = ">=0.4.0,<0.7.0" }
pytest = "^8.2.1"
black = "24.3.0"
pre-commit = "^3.6.0"
Expand All @@ -34,10 +35,9 @@ mistralai = "^1.1.0"
spacy = "^3.8.4"
together = "^1.3.11"


[tool.poetry.extras]
ids = ["ulid-py", "nanoid"]
embeddings = ["llama-embedder", "onnxruntime"]
embeddings = ["llama-embedder", "onnxruntime", "huggingface_hub", "mistralai", "spacy", "together", "vertexai"]
core = ["chromadb"]

[build-system]
Expand Down
41 changes: 41 additions & 0 deletions test/embeddings/test_nomic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
import pytest
from chromadbx.embeddings.nomic import NomicEmbeddingFunction

# The Nomic embedding function talks to the API via httpx, so skip the whole
# module when it is missing; the skip reason must name httpx, not nomic.
httpx = pytest.importorskip("httpx", reason="httpx not installed")


@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_nomic() -> None:
    """Default construction (env-var key) returns one 768-dim vector per text."""
    documents = ["hello world", "goodbye world"]
    result = NomicEmbeddingFunction()(documents)
    assert len(result) == len(documents)
    assert all(len(vector) == 768 for vector in result)


@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_nomic_with_api_key() -> None:
    """An explicitly supplied API key behaves the same as the environment variable."""
    ef = NomicEmbeddingFunction(api_key=os.getenv("NOMIC_API_KEY"))
    vectors = ef(["hello world", "goodbye world"])
    assert len(vectors) == 2
    for vector in vectors:
        assert len(vector) == 768


@pytest.mark.skipif(
    os.getenv("NOMIC_API_KEY") is None,
    reason="NOMIC_API_KEY environment variable is not set",
)
def test_dimensionality() -> None:
    """The dimensionality parameter controls the length of the returned vectors."""
    reduced = 512
    ef = NomicEmbeddingFunction(dimensionality=reduced)
    vectors = ef(["hello world", "goodbye world"])
    assert len(vectors) == 2
    assert {len(vector) for vector in vectors} == {reduced}
Loading