Skip to content

Commit 3e96eef

Browse files
committed
example(lancedb): expose an env variable to control vector index
1 parent b53e2fa commit 3e96eef

File tree

3 files changed

+35
-10
lines changed

3 files changed

+35
-10
lines changed

examples/text_embedding_lancedb/.env

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,10 @@ COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
44
# Fallback to CPU for operations not supported by MPS on Mac.
55
# It's no-op for other platforms.
66
PYTORCH_ENABLE_MPS_FALLBACK=1
7+
8+
# By default, the vector index is not enabled, because LanceDB requires at least
9+
# 256 rows to be there before it can build the index (see
10+
# https://github.com/lance-format/lance/issues/4034 for more details).
11+
#
12+
# After your table has enough data, you can change the following value to `true` to enable the index:
13+
ENABLE_LANCEDB_VECTOR_INDEX=false

examples/text_embedding_lancedb/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ You can also run the command with `-L`, which will watch for file changes and up
4646
cocoindex update -L main
4747
```
4848

49+
By default, the vector index is not enabled, because LanceDB requires at least 256 rows to be there before it can build the index (see [this issue](https://github.com/lance-format/lance/issues/4034) for more details).
50+
After your LanceDB target table has enough data, you can update the `.env` file with the following environment variable to enable the vector index from then on:
51+
52+
```sh
53+
ENABLE_LANCEDB_VECTOR_INDEX=true
54+
```
55+
4956
## CocoInsight
5057

5158
I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.

examples/text_embedding_lancedb/main.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from dotenv import load_dotenv
1+
import os
22
import datetime
33
import cocoindex
44
import math
@@ -31,8 +31,16 @@ def text_embedding_flow(
3131
"""
3232
Define an example flow that embeds text into a vector database.
3333
"""
34+
ENABLE_LANCEDB_VECTOR_INDEX = os.environ.get(
35+
"ENABLE_LANCEDB_VECTOR_INDEX", "0"
36+
).lower() in ("true", "1")
37+
3438
data_scope["documents"] = flow_builder.add_source(
35-
cocoindex.sources.LocalFile(path="markdown_files"),
39+
cocoindex.sources.LocalFile(
40+
path="../../",
41+
included_patterns=["*.md", "*.mdx", "*.rs", "*.py"],
42+
excluded_patterns=["**/.*", "target", "**/node_modules"],
43+
),
3644
refresh_interval=datetime.timedelta(seconds=5),
3745
)
3846

@@ -57,18 +65,21 @@ def text_embedding_flow(
5765
text_embedding=chunk["embedding"],
5866
)
5967

68+
# We cannot enable the index until the table has enough data (at least 256 rows), as LanceDB requires data to train the index.
69+
# See: https://github.com/lancedb/lance/issues/4034
70+
# Guard it with ENABLE_LANCEDB_VECTOR_INDEX environment variable.
71+
vector_indexes = []
72+
if ENABLE_LANCEDB_VECTOR_INDEX:
73+
vector_indexes.append(
74+
cocoindex.VectorIndexDef(
75+
"text_embedding", cocoindex.VectorSimilarityMetric.L2_DISTANCE
76+
)
77+
)
6078
doc_embeddings.export(
6179
"doc_embeddings",
6280
coco_lancedb.LanceDB(db_uri=LANCEDB_URI, table_name=LANCEDB_TABLE),
6381
primary_key_fields=["id"],
64-
# We cannot enable it when the table has no data yet, as LanceDB requires data to train the index.
65-
# See: https://github.com/lancedb/lance/issues/4034
66-
#
67-
# vector_indexes=[
68-
# cocoindex.VectorIndexDef(
69-
# "text_embedding", cocoindex.VectorSimilarityMetric.L2_DISTANCE
70-
# ),
71-
# ],
82+
vector_indexes=vector_indexes,
7283
)
7384

7485

0 commit comments

Comments
 (0)