example(lancedb): add lancedb target example (#1042)

georgeh0 · web-flow · commit 89be4970f4cf · 2025-09-23T21:31:41.000-07:00
* example(lancedb): add lancedb target example

* chore: update `/README.md`
diff --git a/README.md b/README.md
@@ -181,6 +181,7 @@ It defines an index flow like this:
 | [Google Drive Text Embedding](examples/gdrive_text_embedding) | Index text documents from Google Drive |
 | [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph |
 | [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search |
+| [Embeddings to LanceDB](examples/text_embedding_lancedb) | Index documents in a LanceDB table for semantic search |
 | [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup |
 | [Product Recommendation](examples/product_recommendation) | Build real-time product recommendations with LLM and graph database|
 | [Image Search with Vision API](examples/image_search) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend|
diff --git a/examples/text_embedding_lancedb/.env b/examples/text_embedding_lancedb/.env
@@ -0,0 +1,6 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
+
+# Fallback to CPU for operations not supported by MPS on Mac.
+# It's a no-op on other platforms.
+PYTORCH_ENABLE_MPS_FALLBACK=1
diff --git a/examples/text_embedding_lancedb/.gitignore b/examples/text_embedding_lancedb/.gitignore
@@ -0,0 +1 @@
+/lancedb_data
diff --git a/examples/text_embedding_lancedb/README.md b/examples/text_embedding_lancedb/README.md
@@ -0,0 +1,58 @@
+# Build text embedding and semantic search 🔍 with LanceDB
+
+[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
+
+CocoIndex supports LanceDB natively. In this example, we will build an index flow that computes text embeddings from local markdown files, and then query the index. We will use **LanceDB** as the vector database.
+
+We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful.
+
+
+## Steps
+### Indexing Flow
+
+1.  We will ingest a list of local files.
+2.  For each file, perform chunking (recursively split) and then embedding.
+3.  We will save the embeddings and the metadata in LanceDB.
+
+### Query
+
+1.  We have `search()` as a [query handler](https://cocoindex.io/docs/query#query-handler), to query the LanceDB table with the LanceDB client.
+2.  We share the embedding operation `text_to_embedding()` between indexing and querying,
+  by wrapping it as a [transform flow](https://cocoindex.io/docs/query#transform-flow).
+
+## Pre-requisites
+
+1.  [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. Although the target store is LanceDB, CocoIndex uses Postgres to track the data lineage for incremental processing.
+
+2.  Install dependencies:
+
+    ```sh
+    pip install -e .
+    ```
+
+LanceDB will automatically create a local database directory when you run the example (no additional setup required).
+
+## Run
+
+Update the index, which will also set up the LanceDB tables the first time it runs:
+
+```bash
+cocoindex update --setup main
+```
+
+You can also run the command with `-L`, which will watch for file changes and update the index automatically.
+
+```bash
+cocoindex update --setup -L main
+```
+
+## CocoInsight
+I used CocoInsight (free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
+
+```bash
+cocoindex server -ci -L main
+```
+
+Open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
+You can run queries in the CocoInsight UI.
diff --git a/examples/text_embedding_lancedb/main.py b/examples/text_embedding_lancedb/main.py
@@ -0,0 +1,109 @@
+from dotenv import load_dotenv
+import datetime
+import cocoindex
+import math
+import cocoindex.targets.lancedb as coco_lancedb
+
+# Define LanceDB connection constants
+LANCEDB_URI = "./lancedb_data"  # Local directory for LanceDB
+LANCEDB_TABLE = "TextEmbedding"
+
+
+@cocoindex.transform_flow()
+def text_to_embedding(
+    text: cocoindex.DataSlice[str],
+) -> cocoindex.DataSlice[list[float]]:
+    """
+    Embed the text using a SentenceTransformer model.
+    This is a shared logic between indexing and querying, so extract it as a function.
+    """
+    return text.transform(
+        cocoindex.functions.SentenceTransformerEmbed(
+            model="sentence-transformers/all-MiniLM-L6-v2"
+        )
+    )
+
+
+@cocoindex.flow_def(name="TextEmbeddingWithLanceDB")
+def text_embedding_flow(
+    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+) -> None:
+    """
+    Read local files, split them into chunks, embed each chunk, and export the rows to LanceDB.
+    """
+    data_scope["documents"] = flow_builder.add_source(
+        cocoindex.sources.LocalFile(path="markdown_files"),
+        refresh_interval=datetime.timedelta(seconds=5),
+    )
+
+    doc_embeddings = data_scope.add_collector()
+
+    with data_scope["documents"].row() as doc:
+        doc["chunks"] = doc["content"].transform(
+            cocoindex.functions.SplitRecursively(),
+            language="markdown",
+            chunk_size=500,
+            chunk_overlap=100,
+        )
+
+        with doc["chunks"].row() as chunk:
+            chunk["embedding"] = text_to_embedding(chunk["text"])
+            doc_embeddings.collect(
+                id=cocoindex.GeneratedField.UUID,
+                filename=doc["filename"],
+                location=chunk["location"],
+                text=chunk["text"],
+                # 'text_embedding' is the vector column name; search() passes the same name to LanceDB.
+                text_embedding=chunk["embedding"],
+            )
+
+    doc_embeddings.export(
+        "doc_embeddings",
+        coco_lancedb.LanceDB(db_uri=LANCEDB_URI, table_name=LANCEDB_TABLE),
+        primary_key_fields=["id"],
+        # The vector index below stays disabled: it cannot be created while the table is empty, as LanceDB needs data to train it.
+        # See: https://github.com/lancedb/lance/issues/4034
+        #
+        #   vector_indexes=[
+        #       cocoindex.VectorIndexDef(
+        #           "text_embedding", cocoindex.VectorSimilarityMetric.L2_DISTANCE
+        #       ),
+        #   ],
+    )
+
+
+@text_embedding_flow.query_handler(
+    result_fields=cocoindex.QueryHandlerResultFields(
+        embedding=["embedding"],
+        score="score",
+    ),
+)
+async def search(query: str) -> cocoindex.QueryOutput:
+    print("Searching...", query)
+    db = await coco_lancedb.connect_async(LANCEDB_URI)
+    table = await db.open_table(LANCEDB_TABLE)
+
+    # Get the embedding for the query
+    query_embedding = await text_to_embedding.eval_async(query)
+
+    search = await table.search(query_embedding, vector_column_name="text_embedding")
+    search_results = await search.limit(5).to_list()
+
+    print(search_results)
+
+    return cocoindex.QueryOutput(
+        results=[
+            {
+                "filename": result["filename"],
+                "text": result["text"],
+                "embedding": result["text_embedding"],
+                # Qdrant's L2 "distance" is squared, so we take the square root to align with normal L2 distance
+                "score": math.sqrt(result["_distance"]),
+            }
+            for result in search_results
+        ],
+        query_info=cocoindex.QueryInfo(
+            embedding=query_embedding,
+            similarity_metric=cocoindex.VectorSimilarityMetric.L2_DISTANCE,
+        ),
+    )
diff --git a/examples/text_embedding_lancedb/markdown_files/rfc8259.md b/examples/text_embedding_lancedb/markdown_files/rfc8259.md
diff --git a/examples/text_embedding_lancedb/pyproject.toml b/examples/text_embedding_lancedb/pyproject.toml