docs: examples/text_embedding_qdrant

Anush008 · Anush008 · commit 86fc14c5368a · 2025-04-14T12:39:00.000+05:30
Signed-off-by: Anush008 <anushshetty90@gmail.com>
diff --git a/docs/docs/ops/storages.md b/docs/docs/ops/storages.md
@@ -27,8 +27,21 @@ The spec takes the following fields:
 
 *   `api_key` (type: `str`, optional). API key to authenticate requests with.
 
-The field name for the vector embeddings must match the [vector name](https://qdrant.tech/documentation/concepts/vectors/#named-vectors) used when the collection was created.
-
-If no primary key is set during export, a random UUID is used as the Qdrant point ID.
-
-You can find an end-to-end example [here](https://github.com/cocoindex-io/cocoindex/tree/main/examples/text_embedding).
+Before exporting, you must create a collection with a [vector name](https://qdrant.tech/documentation/concepts/vectors/#named-vectors) that matches the vector field name in CocoIndex, and set `setup_by_user=True` during export.
+
+Example:
+
+```python
+doc_embeddings.export(
+    "doc_embeddings",
+    cocoindex.storages.Qdrant(
+        collection_name="cocoindex",
+        grpc_url="https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6334/",
+        api_key="<your-api-key-here>",
+    ),
+    primary_key_fields=["id_field"],
+    setup_by_user=True,
+)
+```
+
+You can find an end-to-end example [here](https://github.com/cocoindex-io/cocoindex/tree/main/examples/text_embedding_qdrant).
diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md
@@ -1,34 +1,7 @@
-## Description
+Simple example for cocoindex: build embedding index based on local files.
 
-Example to build a vector index in Qdrant based on local files.
-
-## Pre-requisites
-
-- [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
-
-- Run Qdrant.
-
-```bash
-docker run -d -p 6334:6334 -p 6333:6333 qdrant/qdrant
-```
-
-- [Create a collection](https://qdrant.tech/documentation/concepts/vectors/#named-vectors) to export the embeddings to.
-
-```bash
-curl  -X PUT \
-  'http://localhost:6333/collections/cocoindex' \
-  --header 'Content-Type: application/json' \
-  --data-raw '{
-  "vectors": {
-    "text_embedding": {
-      "size": 384,
-      "distance": "Cosine"
-    }
-  }
-}'
-```
-
-You can view the collections and data with the Qdrant dashboard at <http://localhost:6333/dashboard>.
+## Prerequisite
+[Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
 
 ## Run
 
@@ -56,14 +29,13 @@ Run:
 python main.py
 ```
 
-## CocoInsight
-
+## CocoInsight 
 CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
 
 Run CocoInsight to understand your RAG data pipeline:
 
-```bash
+```
 python main.py cocoindex server -c https://cocoindex.io
 ```
 
-Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py
@@ -2,79 +2,57 @@
 
 import cocoindex
 
-
 def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
     """
     Embed the text using a SentenceTransformer model.
     This logic is shared between indexing and querying, so it is extracted as a function.
     """
     return text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
-            model="sentence-transformers/all-MiniLM-L6-v2"
-        )
-    )
-
+            model="sentence-transformers/all-MiniLM-L6-v2"))
 
 @cocoindex.flow_def(name="TextEmbedding")
-def text_embedding_flow(
-    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
     """
     Define an example flow that embeds text into a vector database.
     """
     data_scope["documents"] = flow_builder.add_source(
-        cocoindex.sources.LocalFile(path="markdown_files")
-    )
+        cocoindex.sources.LocalFile(path="markdown_files"))
 
     doc_embeddings = data_scope.add_collector()
 
     with data_scope["documents"].row() as doc:
         doc["chunks"] = doc["content"].transform(
             cocoindex.functions.SplitRecursively(),
-            language="markdown",
-            chunk_size=2000,
-            chunk_overlap=500,
-        )
+            language="markdown", chunk_size=2000, chunk_overlap=500)
 
         with doc["chunks"].row() as chunk:
             chunk["embedding"] = text_to_embedding(chunk["text"])
-            doc_embeddings.collect(
-                id=cocoindex.GeneratedField.UUID,
-                filename=doc["filename"],
-                location=chunk["location"],
-                text=chunk["text"],
-                # 'text_embedding' is the name of the vector we've created the Qdrant collection with.
-                text_embedding=chunk["embedding"],
-            )
+            doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
+                                   text=chunk["text"], embedding=chunk["embedding"])
 
     doc_embeddings.export(
         "doc_embeddings",
-        cocoindex.storages.Qdrant(
-            collection_name="cocoindex", grpc_url="http://localhost:6334/"
-        ),
-        primary_key_fields=["id"],
-        setup_by_user=True,
-    )
-
+        cocoindex.storages.Postgres(),
+        primary_key_fields=["filename", "location"],
+        vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
 
 query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
     name="SemanticsSearch",
     flow=text_embedding_flow,
     target_name="doc_embeddings",
     query_transform_flow=text_to_embedding,
-    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
-)
-
+    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
 
 @cocoindex.main_fn()
 def _run():
     # Run queries in a loop to demonstrate the query capabilities.
     while True:
         try:
             query = input("Enter search query (or Enter to quit): ")
-            if query == "":
+            if query == '':
                 break
-            results, _ = query_handler.search(query, 10, "text_embedding")
+            results, _ = query_handler.search(query, 10)
             print("\nSearch results:")
             for result in results:
                 print(f"[{result.score:.3f}] {result.data['filename']}")
@@ -84,7 +62,6 @@ def _run():
         except KeyboardInterrupt:
             break
 
-
 if __name__ == "__main__":
     load_dotenv(override=True)
-    _run()
+    _run()
diff --git a/examples/text_embedding/pyproject.toml b/examples/text_embedding/pyproject.toml
@@ -3,4 +3,4 @@ name = "text-embedding"
 version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on local text files."
 requires-python = ">=3.10"
-dependencies = ["cocoindex>=0.1.19", "python-dotenv>=1.0.1"]
+dependencies = ["cocoindex>=0.1.19", "python-dotenv>=1.0.1"]
diff --git a/examples/text_embedding_qdrant/.env b/examples/text_embedding_qdrant/.env
@@ -0,0 +1,2 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md
@@ -0,0 +1,69 @@
+## Description
+
+Example to build a vector index in Qdrant based on local files.
+
+## Pre-requisites
+
+- [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
+
+- Run Qdrant.
+
+```bash
+docker run -d -p 6334:6334 -p 6333:6333 qdrant/qdrant
+```
+
+- [Create a collection](https://qdrant.tech/documentation/concepts/vectors/#named-vectors) to export the embeddings to.
+
+```bash
+curl  -X PUT \
+  'http://localhost:6333/collections/cocoindex' \
+  --header 'Content-Type: application/json' \
+  --data-raw '{
+  "vectors": {
+    "text_embedding": {
+      "size": 384,
+      "distance": "Cosine"
+    }
+  }
+}'
+```
+
+You can view the collections and data with the Qdrant dashboard at <http://localhost:6333/dashboard>.
+
+## Run
+
+Install dependencies:
+
+```bash
+pip install -e .
+```
+
+Setup:
+
+```bash
+python main.py cocoindex setup
+```
+
+Update index:
+
+```bash
+python main.py cocoindex update
+```
+
+Run:
+
+```bash
+python main.py
+```
+
+## CocoInsight
+
+CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
+
+Run CocoInsight to understand your RAG data pipeline:
+
+```bash
+python main.py cocoindex server -c https://cocoindex.io
+```
+
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py
@@ -0,0 +1,90 @@
+from dotenv import load_dotenv
+
+import cocoindex
+
+
+def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
+    """
+    Embed the text using a SentenceTransformer model.
+    This logic is shared between indexing and querying, so it is extracted as a function.
+    """
+    return text.transform(
+        cocoindex.functions.SentenceTransformerEmbed(
+            model="sentence-transformers/all-MiniLM-L6-v2"
+        )
+    )
+
+
+@cocoindex.flow_def(name="TextEmbedding")
+def text_embedding_flow(
+    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+):
+    """
+    Define an example flow that embeds text into a vector database.
+    """
+    data_scope["documents"] = flow_builder.add_source(
+        cocoindex.sources.LocalFile(path="markdown_files")
+    )
+
+    doc_embeddings = data_scope.add_collector()
+
+    with data_scope["documents"].row() as doc:
+        doc["chunks"] = doc["content"].transform(
+            cocoindex.functions.SplitRecursively(),
+            language="markdown",
+            chunk_size=2000,
+            chunk_overlap=500,
+        )
+
+        with doc["chunks"].row() as chunk:
+            chunk["embedding"] = text_to_embedding(chunk["text"])
+            doc_embeddings.collect(
+                id=cocoindex.GeneratedField.UUID,
+                filename=doc["filename"],
+                location=chunk["location"],
+                text=chunk["text"],
+                # 'text_embedding' is the name of the vector we've created the Qdrant collection with.
+                text_embedding=chunk["embedding"],
+            )
+
+    doc_embeddings.export(
+        "doc_embeddings",
+        cocoindex.storages.Qdrant(
+            collection_name="cocoindex", grpc_url="http://localhost:6334/"
+        ),
+        primary_key_fields=["id"],
+        setup_by_user=True,
+    )
+
+
+query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
+    name="SemanticsSearch",
+    flow=text_embedding_flow,
+    target_name="doc_embeddings",
+    query_transform_flow=text_to_embedding,
+    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
+)
+
+
+@cocoindex.main_fn()
+def _run():
+    # Run queries in a loop to demonstrate the query capabilities.
+    while True:
+        try:
+            query = input("Enter search query (or Enter to quit): ")
+            if query == "":
+                break
+            results, _ = query_handler.search(query, 10, "text_embedding")
+            print("\nSearch results:")
+            for result in results:
+                print(f"[{result.score:.3f}] {result.data['filename']}")
+                print(f"    {result.data['text']}")
+                print("---")
+            print()
+        except KeyboardInterrupt:
+            break
+
+
+if __name__ == "__main__":
+    load_dotenv(override=True)
+    _run()
diff --git a/examples/text_embedding_qdrant/markdown_files/rfc8259.md b/examples/text_embedding_qdrant/markdown_files/rfc8259.md
diff --git a/examples/text_embedding_qdrant/pyproject.toml b/examples/text_embedding_qdrant/pyproject.toml

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Postgres database address for cocoindex`
	`2`	`+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex`