diff --git a/docs/docs/getting_started/quickstart.md b/docs/docs/getting_started/quickstart.md index 81dccdd2c..54a201902 100644 --- a/docs/docs/getting_started/quickstart.md +++ b/docs/docs/getting_started/quickstart.md @@ -21,7 +21,7 @@ We'll need to install a bunch of dependencies for this project. 1. Install CocoIndex: ```bash - pip install -U cocoindex + pip install -U 'cocoindex[embeddings]' ``` 2. You can skip this step if you already have a Postgres database with pgvector extension installed. diff --git a/docs/docs/ops/functions.md b/docs/docs/ops/functions.md index 44492753a..eebd4650a 100644 --- a/docs/docs/ops/functions.md +++ b/docs/docs/ops/functions.md @@ -77,6 +77,15 @@ Return: [*KTable*](/docs/core/data_types#ktable), each row represents a chunk, w `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library. +:::note Optional Dependency Required + +This function requires the `sentence-transformers` library, which is an optional dependency. Install CocoIndex with the `embeddings` extra to get it: + +```bash +pip install 'cocoindex[embeddings]' +``` +::: + The spec takes the following fields: * `model` (`str`): The name of the SentenceTransformer model to use. diff --git a/examples/amazon_s3_embedding/pyproject.toml b/examples/amazon_s3_embedding/pyproject.toml index 9de3cc2b1..c005d4e08 100644 --- a/examples/amazon_s3_embedding/pyproject.toml +++ b/examples/amazon_s3_embedding/pyproject.toml @@ -3,7 +3,7 @@ name = "amazon-s3-text-embedding" version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on Amazon S3 files." 
requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.52", "python-dotenv>=1.0.1"] +dependencies = ["cocoindex[embeddings]>=0.1.52", "python-dotenv>=1.0.1"] [tool.setuptools] packages = [] diff --git a/examples/code_embedding/pyproject.toml b/examples/code_embedding/pyproject.toml index a3a222090..1c9dd5c76 100644 --- a/examples/code_embedding/pyproject.toml +++ b/examples/code_embedding/pyproject.toml @@ -3,7 +3,7 @@ name = "code-embedding" version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on source code." requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.56", "python-dotenv>=1.0.1"] +dependencies = ["cocoindex[embeddings]>=0.1.56", "python-dotenv>=1.0.1"] [tool.setuptools] packages = [] diff --git a/examples/fastapi_server_docker/requirements.txt b/examples/fastapi_server_docker/requirements.txt index df7388bac..c2a108ad8 100644 --- a/examples/fastapi_server_docker/requirements.txt +++ b/examples/fastapi_server_docker/requirements.txt @@ -1,4 +1,4 @@ -cocoindex>=0.1.52 +cocoindex[embeddings]>=0.1.52 python-dotenv>=1.0.1 fastapi==0.115.12 fastapi-cli==0.0.7 diff --git a/examples/gdrive_text_embedding/pyproject.toml b/examples/gdrive_text_embedding/pyproject.toml index 90c49893c..16ef704cb 100644 --- a/examples/gdrive_text_embedding/pyproject.toml +++ b/examples/gdrive_text_embedding/pyproject.toml @@ -3,7 +3,7 @@ name = "gdrive-text-embedding" version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on Google Drive files." 
requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.52", "python-dotenv>=1.0.1"] +dependencies = ["cocoindex[embeddings]>=0.1.52", "python-dotenv>=1.0.1"] [tool.setuptools] packages = [] diff --git a/examples/pdf_embedding/pyproject.toml b/examples/pdf_embedding/pyproject.toml index 8043a0740..7df6cae6c 100644 --- a/examples/pdf_embedding/pyproject.toml +++ b/examples/pdf_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local PDF files." requires-python = ">=3.11" dependencies = [ - "cocoindex>=0.1.52", + "cocoindex[embeddings]>=0.1.52", "python-dotenv>=1.0.1", "marker-pdf>=1.5.2", ] diff --git a/examples/text_embedding/pyproject.toml b/examples/text_embedding/pyproject.toml index 722771753..05003c98e 100644 --- a/examples/text_embedding/pyproject.toml +++ b/examples/text_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local text files." requires-python = ">=3.11" dependencies = [ - "cocoindex>=0.1.52", + "cocoindex[embeddings]>=0.1.52", "python-dotenv>=1.0.1", "pgvector>=0.4.1", "psycopg[binary,pool]", diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md index b9309b82e..6a3795c07 100644 --- a/examples/text_embedding_qdrant/README.md +++ b/examples/text_embedding_qdrant/README.md @@ -37,6 +37,7 @@ We use Qdrant client to query the index, and reuse the embedding operation in th pip install -e . ``` + - Setup: ```bash diff --git a/examples/text_embedding_qdrant/pyproject.toml b/examples/text_embedding_qdrant/pyproject.toml index ec2e8cadb..51c6fe2d2 100644 --- a/examples/text_embedding_qdrant/pyproject.toml +++ b/examples/text_embedding_qdrant/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local text files." 
requires-python = ">=3.11" dependencies = [ - "cocoindex>=0.1.52", + "cocoindex[embeddings]>=0.1.52", "python-dotenv>=1.0.1", "qdrant-client>=1.6.0", ] diff --git a/pyproject.toml b/pyproject.toml index 1d0e3f881..edff1091e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,6 @@ authors = [{ name = "CocoIndex", email = "cocoindex.io@gmail.com" }] readme = "README.md" requires-python = ">=3.11" dependencies = [ - "sentence-transformers>=3.3.1", "click>=8.1.8", "rich>=14.0.0", "python-dotenv>=1.1.0", @@ -31,6 +30,7 @@ features = ["pyo3/extension-module"] [project.optional-dependencies] test = ["pytest"] dev = ["ruff", "pre-commit"] +embeddings = ["sentence-transformers>=3.3.1"] [tool.mypy] python_version = "3.11" diff --git a/python/cocoindex/functions.py b/python/cocoindex/functions.py index 8ece16045..58f41eb24 100644 --- a/python/cocoindex/functions.py +++ b/python/cocoindex/functions.py @@ -12,6 +12,14 @@ if TYPE_CHECKING: import sentence_transformers +# Check if sentence_transformers is available +try: + import sentence_transformers + + _SENTENCE_TRANSFORMERS_AVAILABLE = True +except ImportError: + _SENTENCE_TRANSFORMERS_AVAILABLE = False + class ParseJson(op.FunctionSpec): """Parse a text into a JSON object.""" @@ -58,6 +66,10 @@ class SentenceTransformerEmbed(op.FunctionSpec): model: The name of the SentenceTransformer model to use. args: Additional arguments to pass to the SentenceTransformer constructor. e.g. {"trust_remote_code": True} + + Note: + This function requires the optional sentence-transformers dependency. + Install it with: pip install 'cocoindex[embeddings]' """ model: str @@ -72,6 +84,14 @@ class SentenceTransformerEmbedExecutor: _model: "sentence_transformers.SentenceTransformer" def analyze(self, text: Any) -> type: + if not _SENTENCE_TRANSFORMERS_AVAILABLE: + raise ImportError( + "sentence_transformers is required for SentenceTransformerEmbed function. 
" + "Install it with one of these commands:\n" + " pip install 'cocoindex[embeddings]'\n" + " pip install sentence-transformers" + ) + import sentence_transformers # pylint: disable=import-outside-toplevel args = self.spec.args or {}