example(lancedb): expose an env variable to control vector index

georgeh0 · georgeh0 · commit 3e96eefe52e8 · 2025-12-01T16:23:22.000-08:00
diff --git a/examples/text_embedding_lancedb/.env b/examples/text_embedding_lancedb/.env
@@ -4,3 +4,10 @@ COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
 # Fallback to CPU for operations not supported by MPS on Mac.
 # It's no-op for other platforms.
 PYTORCH_ENABLE_MPS_FALLBACK=1
+
+# By default, the vector index is not enabled, because LanceDB requires at least
+# 256 rows to be there before it can build the index (see
+# https://github.com/lance-format/lance/issues/4034 for more details).
+#
+# After your LanceDB table has enough data, you can change the following value to `true` to enable the index:
+ENABLE_LANCEDB_VECTOR_INDEX=false
diff --git a/examples/text_embedding_lancedb/README.md b/examples/text_embedding_lancedb/README.md
@@ -46,6 +46,13 @@ You can also run the command with `-L`, which will watch for file changes and up
 cocoindex update -L main
 ```
 
+By default, the vector index is not enabled, because LanceDB requires at least 256 rows to be there before it can build the index (see [this issue](https://github.com/lance-format/lance/issues/4034) for more details).
+After your LanceDB target table has enough data, you can update the `.env` file with the following environment variable to enable the vector index from then on:
+
+```sh
+ENABLE_LANCEDB_VECTOR_INDEX=true
+```
+
 ## CocoInsight
 
 I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
diff --git a/examples/text_embedding_lancedb/main.py b/examples/text_embedding_lancedb/main.py
@@ -1,4 +1,4 @@
-from dotenv import load_dotenv
+import os
 import datetime
 import cocoindex
 import math
@@ -31,8 +31,16 @@ def text_embedding_flow(
     """
     Define an example flow that embeds text into a vector database.
     """
+    ENABLE_LANCEDB_VECTOR_INDEX = os.environ.get(
+        "ENABLE_LANCEDB_VECTOR_INDEX", "0"
+    ).lower() in ("true", "1")
+
     data_scope["documents"] = flow_builder.add_source(
-        cocoindex.sources.LocalFile(path="markdown_files"),
+        cocoindex.sources.LocalFile(
+            path="../../",
+            included_patterns=["*.md", "*.mdx", "*.rs", "*.py"],
+            excluded_patterns=["**/.*", "target", "**/node_modules"],
+        ),
         refresh_interval=datetime.timedelta(seconds=5),
     )
 
@@ -57,18 +65,21 @@ def text_embedding_flow(
                 text_embedding=chunk["embedding"],
             )
 
+    # We cannot enable the index when the table has no data yet, as LanceDB requires data to train the index.
+    # See: https://github.com/lance-format/lance/issues/4034
+    # Guard it with ENABLE_LANCEDB_VECTOR_INDEX environment variable.
+    vector_indexes = []
+    if ENABLE_LANCEDB_VECTOR_INDEX:
+        vector_indexes.append(
+            cocoindex.VectorIndexDef(
+                "text_embedding", cocoindex.VectorSimilarityMetric.L2_DISTANCE
+            )
+        )
     doc_embeddings.export(
         "doc_embeddings",
         coco_lancedb.LanceDB(db_uri=LANCEDB_URI, table_name=LANCEDB_TABLE),
         primary_key_fields=["id"],
-        # We cannot enable it when the table has no data yet, as LanceDB requires data to train the index.
-        # See: https://github.com/lancedb/lance/issues/4034
-        #
-        #   vector_indexes=[
-        #       cocoindex.VectorIndexDef(
-        #           "text_embedding", cocoindex.VectorSimilarityMetric.L2_DISTANCE
-        #       ),
-        #   ],
+        vector_indexes=vector_indexes,
     )