diff --git a/chromadb/test/ef/test_ef.py b/chromadb/test/ef/test_ef.py index 264284ecebe..186d8b6ff19 100644 --- a/chromadb/test/ef/test_ef.py +++ b/chromadb/test/ef/test_ef.py @@ -39,6 +39,7 @@ def test_get_builtins_holds() -> None: "JinaEmbeddingFunction", "MistralEmbeddingFunction", "MorphEmbeddingFunction", + "NomicEmbeddingFunction", "ONNXMiniLM_L6_V2", "OllamaEmbeddingFunction", "OpenAIEmbeddingFunction", diff --git a/chromadb/utils/embedding_functions/__init__.py b/chromadb/utils/embedding_functions/__init__.py index f2ca5ecb9a3..2622b6f0e5e 100644 --- a/chromadb/utils/embedding_functions/__init__.py +++ b/chromadb/utils/embedding_functions/__init__.py @@ -68,6 +68,10 @@ from chromadb.utils.embedding_functions.morph_embedding_function import ( MorphEmbeddingFunction, ) +from chromadb.utils.embedding_functions.nomic_embedding_function import ( + NomicEmbeddingFunction, + NomicQueryConfig, +) from chromadb.utils.embedding_functions.huggingface_sparse_embedding_function import ( HuggingFaceSparseEmbeddingFunction, ) @@ -103,6 +107,7 @@ "JinaEmbeddingFunction", "MistralEmbeddingFunction", "MorphEmbeddingFunction", + "NomicEmbeddingFunction", "VoyageAIEmbeddingFunction", "ONNXMiniLM_L6_V2", "OpenCLIPEmbeddingFunction", @@ -142,6 +147,7 @@ def get_builtins() -> Set[str]: "jina": JinaEmbeddingFunction, "mistral": MistralEmbeddingFunction, "morph": MorphEmbeddingFunction, + "nomic": NomicEmbeddingFunction, "voyageai": VoyageAIEmbeddingFunction, "onnx_mini_lm_l6_v2": ONNXMiniLM_L6_V2, "open_clip": OpenCLIPEmbeddingFunction, @@ -265,6 +271,8 @@ def config_to_embedding_function(config: Dict[str, Any]) -> EmbeddingFunction: "JinaQueryConfig", "MistralEmbeddingFunction", "MorphEmbeddingFunction", + "NomicEmbeddingFunction", + "NomicQueryConfig", "VoyageAIEmbeddingFunction", "ONNXMiniLM_L6_V2", "OpenCLIPEmbeddingFunction", diff --git a/chromadb/utils/embedding_functions/nomic_embedding_function.py b/chromadb/utils/embedding_functions/nomic_embedding_function.py new file 
mode 100644
index 00000000000..c65c5864d0e
--- /dev/null
+++ b/chromadb/utils/embedding_functions/nomic_embedding_function.py
@@ -0,0 +1,178 @@
+from chromadb.api.types import (
+    Embeddings,
+    Documents,
+    EmbeddingFunction,
+    Space,
+)
+from chromadb.utils.embedding_functions.schemas import validate_config_schema
+from typing import List, Dict, Any, TypedDict, Optional
+import os
+import numpy as np
+
+
+class NomicQueryConfig(TypedDict):
+    """Query-time settings for NomicEmbeddingFunction."""
+
+    # Task type sent to Nomic when embedding queries, e.g. "search_query".
+    task_type: str
+
+
+class NomicEmbeddingFunction(EmbeddingFunction[Documents]):
+    """
+    This class is used to get embeddings for a list of texts using the Nomic API.
+    """
+
+    def __init__(
+        self,
+        model: str,
+        task_type: str,
+        query_config: Optional[NomicQueryConfig] = None,
+        api_key_env_var: str = "NOMIC_API_KEY",
+    ):
+        """
+        Initialize the NomicEmbeddingFunction.
+
+        Args:
+            model (str): The name of the model to use for text embeddings.
+            task_type (str): The type of task to embed with. See reference https://docs.nomic.ai/platform/embeddings-and-retrieval/text-embedding#embedding-task-types
+            query_config (Optional[NomicQueryConfig]): The configuration for setting task type for queries.
+                Defaults to None, in which case queries are embedded with `task_type`.
+            api_key_env_var (str): The environment variable name for the Nomic API key. Defaults to "NOMIC_API_KEY".
+
+        Supported task types: search_document, search_query, classification, clustering
+
+        Raises:
+            ValueError: If the nomic package is not installed or the API key
+                environment variable is not set.
+        """
+        try:
+            from nomic import embed
+        except ImportError as err:
+            raise ValueError(
+                "The nomic python package is not installed. Please install it with `pip install nomic`"
+            ) from err
+
+        self.model = model
+        self.task_type = task_type
+        self.api_key_env_var = api_key_env_var
+        self.api_key = os.getenv(api_key_env_var)
+        self.query_config = query_config
+        if not self.api_key:
+            raise ValueError(f"The {api_key_env_var} environment variable is not set.")
+        # NOTE(review): the key is only checked for presence; nothing in this
+        # class hands it to the nomic client. Confirm the client picks it up,
+        # especially when api_key_env_var is not the default.
+        self.embed = embed
+
+    def _embed_texts(self, texts: Documents, task_type: str) -> Embeddings:
+        """
+        Embed `texts` with `task_type` and return one float32 array per text.
+        """
+        output = self.embed.text(
+            model=self.model,
+            texts=texts,
+            task_type=task_type,
+        )
+        # nomic's embed.text returns a dict with the vectors under "embeddings";
+        # it has no `.data` attribute.
+        return [
+            np.array(embedding, dtype=np.float32)
+            for embedding in output["embeddings"]
+        ]
+
+    def __call__(self, input: Documents) -> Embeddings:
+        """
+        Embed documents using the configured `task_type`.
+
+        Raises:
+            ValueError: If any item of `input` is not a string.
+        """
+        if not all(isinstance(item, str) for item in input):
+            raise ValueError("Nomic only supports text documents, not images")
+        return self._embed_texts(input, self.task_type)
+
+    def embed_query(self, input: Documents) -> Embeddings:
+        """
+        Embed queries using the task type from `query_config` when it provides
+        one, otherwise the document `task_type`.
+
+        Raises:
+            ValueError: If any item of `input` is not a string.
+        """
+        if not all(isinstance(item, str) for item in input):
+            raise ValueError("Nomic only supports text queries, not images")
+
+        # Fall back to the document task type when query_config is absent or
+        # carries no task_type, instead of sending None to the API.
+        query_task_type = (
+            self.query_config.get("task_type") if self.query_config else None
+        )
+        return self._embed_texts(input, query_task_type or self.task_type)
+
+    @staticmethod
+    def name() -> str:
+        """Return the identifier of this embedding function ("nomic")."""
+        return "nomic"
+
+    def default_space(self) -> Space:
+        """Return the default distance space, "cosine"."""
+        return "cosine"
+
+    def supported_spaces(self) -> List[Space]:
+        """Return the supported distance spaces."""
+        return ["cosine", "l2", "ip"]
+
+    @staticmethod
+    def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]":
+        """
+        Build a NomicEmbeddingFunction from a config dict as produced by get_config.
+
+        Args:
+            config: Must contain "model", "api_key_env_var" and "task_type";
+                "query_config" is optional.
+        """
+        model = config.get("model")
+        api_key_env_var = config.get("api_key_env_var")
+        task_type = config.get("task_type")
+        query_config = config.get("query_config")
+        if model is None or api_key_env_var is None or task_type is None:
+            assert False, "This code should not be reached"  # this is for type checking
+        return NomicEmbeddingFunction(
+            model=model,
+            api_key_env_var=api_key_env_var,
+            task_type=task_type,
+            query_config=query_config,
+        )
+
+    def get_config(self) -> Dict[str, Any]:
+        """Return the settings needed to rebuild this function via build_from_config."""
+        return {
+            "model": self.model,
+            "api_key_env_var": self.api_key_env_var,
+            "task_type": self.task_type,
+            "query_config": self.query_config,
+        }
+
+    def validate_config_update(
+        self, old_config: Dict[str, Any], new_config: Dict[str, Any]
+    ) -> None:
+        """
+        Reject config updates that try to change the model.
+
+        Raises:
+            ValueError: If `new_config` contains a "model" key.
+        """
+        if "model" in new_config:
+            raise ValueError(
+                "The model cannot be changed after the embedding function has been initialized."
+            )
+
+    @staticmethod
+    def validate_config(config: Dict[str, Any]) -> None:
+        """
+        Validate the configuration using the JSON schema.
+
+        Args:
+            config: Configuration to validate
+        """
+        validate_config_schema(config, "nomic")
diff --git a/schemas/embedding_functions/nomic.json b/schemas/embedding_functions/nomic.json
new file mode 100644
index 00000000000..d20722a9d48
--- /dev/null
+++ b/schemas/embedding_functions/nomic.json
@@ -0,0 +1,31 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Nomic Embedding Function Schema",
+  "description": "Schema for the Nomic embedding function configuration",
+  "version": "1.0.0",
+  "type": "object",
+  "properties": {
+    "model": {
+      "type": "string",
+      "description": "Parameter model for the Nomic embedding function"
+    },
+    "api_key_env_var": {
+      "type": "string",
+      "description": "Parameter api_key_env_var for the Nomic embedding function"
+    },
+    "task_type": {
+      "type": "string",
+      "description": "Parameter task_type for the Nomic embedding function"
+    },
+    "query_config": {
+      "type": ["object", "null"],
+      "description": "Parameter query_config for the Nomic embedding function",
+      "properties": {
+        "task_type": {
+          "type": "string",
+          "description": "Parameter task_type used when embedding queries"
+        }
+      }
+    }
+  }
+}