Skip to content

Commit 457c710

Browse files
authored
Update vector_index for export() to take list of VectorIndexDef. (#304)
1 parent e8a6290 commit 457c710

File tree

15 files changed

+69
-43
lines changed

15 files changed

+69
-43
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
8080
"doc_embeddings",
8181
cocoindex.storages.Postgres(),
8282
primary_key_fields=["filename", "location"],
83-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
83+
vector_indexes=[
84+
cocoindex.VectorIndexDef(
85+
field_name="embedding",
86+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
8487
```
8588

8689
It defines an index flow like this:

docs/docs/core/flow_def.mdx

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,10 @@ Export must happen at the top level of a flow, i.e. not within any child scopes
259259

260260
* `name`: the name to identify the export target.
261261
* `target_spec`: the storage spec as the export target.
262-
* `primary_key_fields` (optional): the fields to be used as primary key. Types of the fields must be supported as key fields. See [Key Types](data_types#key-types) for more details.
263-
* `vector_index` (optional): the fields to create vector index. Each item is a tuple of a field name and a similarity metric. See [Vector Type](data_types#vector-type) for more details about supported similarity metrics.
262+
* `primary_key_fields` (`Sequence[str]`): the fields to be used as primary key. Types of the fields must be supported as key fields. See [Key Types](data_types#key-types) for more details.
263+
* `vector_indexes` (`Sequence[VectorIndexDef]`, optional): the fields to create vector index. `VectorIndexDef` has the following fields:
264+
* `field_name`: the field to create vector index.
265+
* `metric`: the similarity metric to use. See [Vector Type](data_types#vector-type) for more details about supported similarity metrics.
264266
* `setup_by_user` (optional):
265267
whether the export target is setup by user.
266268
By default, CocoIndex is managing the target setup (surfaced by the `cocoindex setup` CLI subcommand), e.g. create related tables/collections/etc. with compatible schema, and update them upon change.

docs/docs/getting_started/quickstart.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
9797
"doc_embeddings",
9898
cocoindex.storages.Postgres(),
9999
primary_key_fields=["filename", "location"],
100-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
100+
vector_indexes=[
101+
cocoindex.VectorIndexDef(
102+
field_name="embedding",
103+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
101104
```
102105

103106
Notes:

examples/code_embedding/main.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
4141
"code_embeddings",
4242
cocoindex.storages.Postgres(),
4343
primary_key_fields=["filename", "location"],
44-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
44+
vector_indexes=[
45+
cocoindex.VectorIndexDef(
46+
field_name="embedding",
47+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
4548

4649

4750
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(

examples/docs_to_kg/main.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -93,15 +93,13 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
9393
),
9494
nodes={
9595
"Entity": cocoindex.storages.Neo4jRelationshipNodeSpec(
96-
index_options=cocoindex.IndexOptions(
97-
primary_key_fields=["value"],
98-
vector_index_defs=[
99-
cocoindex.VectorIndexDef(
100-
field_name="embedding",
101-
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
102-
),
103-
],
104-
),
96+
primary_key_fields=["value"],
97+
vector_indexes=[
98+
cocoindex.VectorIndexDef(
99+
field_name="embedding",
100+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
101+
),
102+
],
105103
),
106104
},
107105
),

examples/gdrive_text_embedding/main.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope:
3737
"doc_embeddings",
3838
cocoindex.storages.Postgres(),
3939
primary_key_fields=["filename", "location"],
40-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
40+
vector_indexes=[
41+
cocoindex.VectorIndexDef(
42+
field_name="embedding",
43+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
4144

4245
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
4346
name="SemanticsSearch",

examples/pdf_embedding/main.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,10 @@ def pdf_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoinde
6363
"doc_embeddings",
6464
cocoindex.storages.Postgres(),
6565
primary_key_fields=["id"],
66-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
66+
vector_indexes=[
67+
cocoindex.VectorIndexDef(
68+
field_name="embedding",
69+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
6770

6871
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
6972
name="SemanticsSearch",

examples/text_embedding/main.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
3535
"doc_embeddings",
3636
cocoindex.storages.Postgres(),
3737
primary_key_fields=["filename", "location"],
38-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
38+
vector_indexes=[
39+
cocoindex.VectorIndexDef(
40+
field_name="embedding",
41+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
3942

4043
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
4144
name="SemanticsSearch",

python/cocoindex/flow.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -267,21 +267,27 @@ def collect(self, **kwargs):
267267
self._engine_data_collector, regular_kwargs, auto_uuid_field)
268268

269269
def export(self, name: str, target_spec: op.StorageSpec, /, *,
270-
primary_key_fields: Sequence[str] | None = None,
270+
primary_key_fields: Sequence[str],
271+
vector_indexes: Sequence[index.VectorIndexDef] = (),
271272
vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (),
272273
setup_by_user: bool = False):
273274
"""
274275
Export the collected data to the specified target.
276+
277+
`vector_index` is for backward compatibility only. Please use `vector_indexes` instead.
275278
"""
276-
index_options: dict[str, Any] = {}
277-
if primary_key_fields is not None:
278-
index_options["primary_key_fields"] = primary_key_fields
279-
index_options["vector_index_defs"] = [
280-
{"field_name": field_name, "metric": metric.value}
281-
for field_name, metric in vector_index]
279+
# For backward compatibility only.
280+
if len(vector_indexes) == 0 and len(vector_index) > 0:
281+
vector_indexes = [index.VectorIndexDef(field_name=field_name, metric=metric)
282+
for field_name, metric in vector_index]
283+
284+
index_options = index.IndexOptions(
285+
primary_key_fields=primary_key_fields,
286+
vector_indexes=vector_indexes,
287+
)
282288
self._flow_builder_state.engine_flow_builder.export(
283289
name, _spec_kind(target_spec), dump_engine_object(target_spec),
284-
index_options, self._engine_data_collector, setup_by_user)
290+
dump_engine_object(index_options), self._engine_data_collector, setup_by_user)
285291

286292

287293
_flow_name_builder = _NameBuilder()

python/cocoindex/index.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from enum import Enum
22
from dataclasses import dataclass
3-
3+
from typing import Sequence
44
class VectorSimilarityMetric(Enum):
55
COSINE_SIMILARITY = "CosineSimilarity"
66
L2_DISTANCE = "L2Distance"
@@ -19,5 +19,5 @@ class IndexOptions:
1919
"""
2020
Options for an index.
2121
"""
22-
primary_key_fields: list[str] | None = None
23-
vector_index_defs: list[VectorIndexDef] | None = None
22+
primary_key_fields: Sequence[str]
23+
vector_indexes: Sequence[VectorIndexDef] = ()

0 commit comments

Comments
 (0)