Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 64 additions & 12 deletions python/cocoindex/targets/lancedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Any

import lancedb # type: ignore
from lancedb.index import FTS # type: ignore
from lancedb.index import FTS, HnswPq, IvfFlat # type: ignore
import pyarrow as pa # type: ignore

from cocoindex import op
Expand All @@ -21,7 +21,13 @@
VectorTypeSchema,
TableType,
)
from cocoindex.index import IndexOptions, VectorSimilarityMetric
from cocoindex.index import (
IndexOptions,
VectorSimilarityMetric,
VectorIndexMethod,
HnswVectorIndexMethod,
IvfFlatVectorIndexMethod,
)

_logger = logging.getLogger(__name__)

Expand All @@ -32,6 +38,43 @@
}


def _create_vector_index_config(
method: VectorIndexMethod | None, distance_type: str
) -> HnswPq | IvfFlat:
"""
Create the appropriate LanceDB index configuration based on the VectorIndexMethod.

Args:
method: The vector index method configuration, or None for default (HNSW).
distance_type: The distance metric type ('cosine', 'l2', or 'dot').

Returns:
A LanceDB index configuration object (HnswPq or IvfFlat).
"""
if method is None:
# Default to HNSW with PQ (HnswPq)
return HnswPq(distance_type=distance_type)

if isinstance(method, HnswVectorIndexMethod):
# HNSW method - use HnswPq with optional parameters
kwargs: dict[str, Any] = {"distance_type": distance_type}
if method.m is not None:
kwargs["m"] = method.m
if method.ef_construction is not None:
kwargs["ef_construction"] = method.ef_construction
return HnswPq(**kwargs)

if isinstance(method, IvfFlatVectorIndexMethod):
# IVF Flat method
kwargs = {"distance_type": distance_type}
if method.lists is not None:
kwargs["num_partitions"] = method.lists
return IvfFlat(**kwargs)

# Fallback to default
return HnswPq(distance_type=distance_type)


class DatabaseOptions:
storage_options: dict[str, Any] | None = None

Expand All @@ -47,6 +90,7 @@ class _VectorIndex:
name: str
field_name: str
metric: VectorSimilarityMetric
method: VectorIndexMethod | None = None


@dataclasses.dataclass
Expand Down Expand Up @@ -305,22 +349,28 @@ def get_setup_state(
) -> _State:
if len(key_fields_schema) != 1:
raise ValueError("LanceDB only supports a single key field")
if index_options.vector_indexes is not None:
for vector_index in index_options.vector_indexes:
if vector_index.method is not None:
raise ValueError(
"Vector index method is not configurable for LanceDB yet"
)

def _get_method_suffix(method: VectorIndexMethod | None) -> str:
"""Get a suffix string representing the index method for the index name."""
if method is None:
return "hnsw_pq" # default
if isinstance(method, HnswVectorIndexMethod):
return "hnsw_pq"
if isinstance(method, IvfFlatVectorIndexMethod):
return "ivf_flat"
return "hnsw_pq"

return _State(
key_field_schema=key_fields_schema[0],
value_fields_schema=value_fields_schema,
db_options=spec.db_options,
vector_indexes=(
[
_VectorIndex(
name=f"__{index.field_name}__{_LANCEDB_VECTOR_METRIC[index.metric]}__idx",
name=f"__{index.field_name}__{_LANCEDB_VECTOR_METRIC[index.metric]}__{_get_method_suffix(index.method)}__idx",
Copy link

Copilot AI Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The index naming strategy has changed to include the method suffix (e.g., __field__cosine__hnsw_pq__idx). This is a breaking change that will cause existing indexes to be dropped and recreated when upgrading, since the old index names don't include the method suffix.

While this may be intentional to support the new functionality, it's worth noting that:

  1. This could cause downtime during the migration as indexes are rebuilt
  2. For large datasets, index creation can be time-consuming and resource-intensive
  3. Users should be warned about this breaking change in the release notes

Consider documenting this breaking change clearly, or implementing a migration strategy that handles both old and new index naming conventions during a transition period.

Copilot uses AI. Check for mistakes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a breaking change. Let's avoid adding this suffix for the default method, so existing index name won't change.

field_name=index.field_name,
metric=index.metric,
method=index.method,
)
for index in index_options.vector_indexes
]
Expand Down Expand Up @@ -413,12 +463,14 @@ async def apply_setup_change(
unseen_prev_vector_indexes.remove(index.name)
else:
try:
# Determine index configuration based on method
index_config = _create_vector_index_config(
index.method, _LANCEDB_VECTOR_METRIC[index.metric]
)
await table.create_index(
index.field_name,
name=index.name,
config=lancedb.index.HnswPq(
distance_type=_LANCEDB_VECTOR_METRIC[index.metric]
),
config=index_config,
)
except Exception as e: # pylint: disable=broad-exception-caught
raise RuntimeError(
Expand Down
150 changes: 150 additions & 0 deletions python/cocoindex/tests/test_lancedb_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""
Test suite for LanceDB VectorIndexMethod support.

This module tests the VectorIndexMethod configuration for LanceDB target,
including HNSW and IVF Flat index types.

Note: This test file uses standalone copies of the VectorIndexMethod classes
and helper functions to avoid importing the cocoindex module, which requires
the Rust engine to be built. When running in CI, the full module will be tested.
"""
Copy link

Copilot AI Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment states these are "standalone copies... to avoid importing the cocoindex module, which requires the Rust engine to be built. When running in CI, the full module will be tested." However, there's no evidence in the test file of any conditional testing for CI that would test the actual implementation. This means the actual _create_vector_index_config function in lancedb.py is never tested by these unit tests.

Either implement the CI-specific tests that import and test the actual implementation, or restructure the tests to import from the actual modules (using pytest.importorskip if needed).

Copilot uses AI. Check for mistakes.

import pytest
from dataclasses import dataclass
from typing import Any, Union, Optional

# Skip all tests if lancedb is not installed
lancedb = pytest.importorskip("lancedb")

# Import lancedb index types for assertion
from lancedb.index import HnswPq, IvfFlat # type: ignore


# Standalone copies of VectorIndexMethod classes for testing
# These mirror the definitions in cocoindex/index.py
@dataclass
class HnswVectorIndexMethod:
"""HNSW vector index parameters."""
kind: str = "Hnsw"
m: Optional[int] = None
ef_construction: Optional[int] = None


@dataclass
class IvfFlatVectorIndexMethod:
"""IVFFlat vector index parameters."""
kind: str = "IvfFlat"
lists: Optional[int] = None
Copy link

Copilot AI Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Type annotation inconsistency: This file uses Optional[int] (lines 29, 30, 37) while the actual implementation in cocoindex/index.py uses int | None (lines 17-18, 26). For consistency with the codebase, use int | None instead of Optional[int].

Note that Optional is not imported from typing either (line 14 shows only Any, Union, Optional but Optional should be used consistently across the codebase or not at all).

Copilot uses AI. Check for mistakes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please fix this comment. Prefer int | None here.



VectorIndexMethod = Union[HnswVectorIndexMethod, IvfFlatVectorIndexMethod]


def _create_vector_index_config(
method: Optional[VectorIndexMethod],
distance_type: str,
) -> Union[HnswPq, IvfFlat]:
"""
Create the appropriate LanceDB index configuration based on the VectorIndexMethod.
This mirrors the function from lancedb.py for testing without engine dependency.
"""
if method is None:
return HnswPq(distance_type=distance_type)

if isinstance(method, HnswVectorIndexMethod):
kwargs: dict[str, Any] = {"distance_type": distance_type}
if method.m is not None:
kwargs["m"] = method.m
if method.ef_construction is not None:
kwargs["ef_construction"] = method.ef_construction
return HnswPq(**kwargs)

if isinstance(method, IvfFlatVectorIndexMethod):
kwargs = {"distance_type": distance_type}
if method.lists is not None:
kwargs["num_partitions"] = method.lists
return IvfFlat(**kwargs)

return HnswPq(distance_type=distance_type)


Copy link

Copilot AI Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The dataclass definitions and _create_vector_index_config function are duplicated from the main implementation. This creates a maintainability issue where changes to the actual implementation won't be reflected in the tests, potentially causing the tests to pass even when the actual code is broken.

Consider importing the actual classes from cocoindex.index and the function from cocoindex.targets.lancedb instead of duplicating them. If the Rust engine dependency is a concern for CI, you can use pytest.importorskip for the cocoindex module as well, or structure the tests differently to test the actual implementation.

Suggested change
from dataclasses import dataclass
from typing import Any, Union, Optional
# Skip all tests if lancedb is not installed
lancedb = pytest.importorskip("lancedb")
# Import lancedb index types for assertion
from lancedb.index import HnswPq, IvfFlat # type: ignore
# Standalone copies of VectorIndexMethod classes for testing
# These mirror the definitions in cocoindex/index.py
@dataclass
class HnswVectorIndexMethod:
"""HNSW vector index parameters."""
kind: str = "Hnsw"
m: Optional[int] = None
ef_construction: Optional[int] = None
@dataclass
class IvfFlatVectorIndexMethod:
"""IVFFlat vector index parameters."""
kind: str = "IvfFlat"
lists: Optional[int] = None
VectorIndexMethod = Union[HnswVectorIndexMethod, IvfFlatVectorIndexMethod]
def _create_vector_index_config(
method: Optional[VectorIndexMethod],
distance_type: str,
) -> Union[HnswPq, IvfFlat]:
"""
Create the appropriate LanceDB index configuration based on the VectorIndexMethod.
This mirrors the function from lancedb.py for testing without engine dependency.
"""
if method is None:
return HnswPq(distance_type=distance_type)
if isinstance(method, HnswVectorIndexMethod):
kwargs: dict[str, Any] = {"distance_type": distance_type}
if method.m is not None:
kwargs["m"] = method.m
if method.ef_construction is not None:
kwargs["ef_construction"] = method.ef_construction
return HnswPq(**kwargs)
if isinstance(method, IvfFlatVectorIndexMethod):
kwargs = {"distance_type": distance_type}
if method.lists is not None:
kwargs["num_partitions"] = method.lists
return IvfFlat(**kwargs)
return HnswPq(distance_type=distance_type)
# Skip all tests if lancedb is not installed
lancedb = pytest.importorskip("lancedb")
# Skip tests if cocoindex (which requires the Rust engine) is not available
pytest.importorskip("cocoindex")
# Import lancedb index types for assertion
from lancedb.index import HnswPq, IvfFlat # type: ignore
# Import actual VectorIndexMethod classes and configuration function
from cocoindex.index import HnswVectorIndexMethod, IvfFlatVectorIndexMethod
from cocoindex.targets.lancedb import _create_vector_index_config

Copilot uses AI. Check for mistakes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please fix this comment to avoid duplication

class TestCreateVectorIndexConfig:
"""Test suite for _create_vector_index_config function."""

def test_default_method_returns_hnsw_pq(self) -> None:
"""Test that None method defaults to HnswPq."""
config = _create_vector_index_config(None, "cosine")
assert isinstance(config, HnswPq)
assert config.distance_type == "cosine"

def test_hnsw_method_without_params(self) -> None:
"""Test HNSW method without custom parameters."""
method = HnswVectorIndexMethod()
config = _create_vector_index_config(method, "l2")
assert isinstance(config, HnswPq)
assert config.distance_type == "l2"

def test_hnsw_method_with_m_param(self) -> None:
"""Test HNSW method with custom m parameter."""
method = HnswVectorIndexMethod(m=32)
config = _create_vector_index_config(method, "cosine")
assert isinstance(config, HnswPq)
assert config.m == 32

def test_hnsw_method_with_ef_construction(self) -> None:
"""Test HNSW method with custom ef_construction parameter."""
method = HnswVectorIndexMethod(ef_construction=200)
config = _create_vector_index_config(method, "dot")
assert isinstance(config, HnswPq)
assert config.ef_construction == 200

def test_hnsw_method_with_all_params(self) -> None:
"""Test HNSW method with both m and ef_construction parameters."""
method = HnswVectorIndexMethod(m=16, ef_construction=100)
config = _create_vector_index_config(method, "cosine")
assert isinstance(config, HnswPq)
assert config.m == 16
assert config.ef_construction == 100

def test_ivf_flat_method_without_params(self) -> None:
"""Test IVF Flat method without custom parameters."""
method = IvfFlatVectorIndexMethod()
config = _create_vector_index_config(method, "l2")
assert isinstance(config, IvfFlat)
assert config.distance_type == "l2"

def test_ivf_flat_method_with_lists_param(self) -> None:
"""Test IVF Flat method with custom lists (num_partitions) parameter."""
method = IvfFlatVectorIndexMethod(lists=256)
config = _create_vector_index_config(method, "cosine")
assert isinstance(config, IvfFlat)
assert config.num_partitions == 256

def test_all_distance_types(self) -> None:
"""Test that all distance types work correctly."""
for distance_type in ["cosine", "l2", "dot"]:
config = _create_vector_index_config(None, distance_type)
assert config.distance_type == distance_type


class TestVectorIndexMethodIntegration:
"""Integration tests for VectorIndexMethod with LanceDB."""

def test_hnsw_method_kind_attribute(self) -> None:
"""Test that HnswVectorIndexMethod has correct kind attribute."""
method = HnswVectorIndexMethod()
assert method.kind == "Hnsw"

def test_ivf_flat_method_kind_attribute(self) -> None:
"""Test that IvfFlatVectorIndexMethod has correct kind attribute."""
method = IvfFlatVectorIndexMethod()
assert method.kind == "IvfFlat"

def test_method_default_values(self) -> None:
"""Test default values for index methods."""
hnsw = HnswVectorIndexMethod()
assert hnsw.m is None
assert hnsw.ef_construction is None

ivf_flat = IvfFlatVectorIndexMethod()
assert ivf_flat.lists is None
Loading