Commit 60fc836

Test that we can index different storage versions and then query them (#168)
1 parent 3e3e7f8 commit 60fc836

6 files changed: +109 additions, -18 deletions


apis/python/src/tiledb/vector_search/flat_index.py
Lines changed: 10 additions & 5 deletions

@@ -6,7 +6,8 @@
 from tiledb.vector_search import index
 from tiledb.vector_search.module import *
 from tiledb.vector_search.storage_formats import (STORAGE_VERSION,
-                                                   storage_formats)
+                                                   storage_formats,
+                                                   validate_storage_version)

 MAX_INT32 = np.iinfo(np.dtype("int32")).max
 TILE_SIZE_BYTES = 128000000 # 128MB
@@ -119,21 +120,25 @@ def create(
     vector_type: np.dtype,
     group_exists: bool = False,
     config: Optional[Mapping[str, Any]] = None,
+    storage_version: str = STORAGE_VERSION,
     **kwargs,
 ) -> FlatIndex:
+    validate_storage_version(storage_version)
+
     index.create_metadata(
         uri=uri,
         dimensions=dimensions,
         vector_type=vector_type,
         index_type=INDEX_TYPE,
+        storage_version=storage_version,
         group_exists=group_exists,
         config=config,
     )
     with tiledb.scope_ctx(ctx_or_config=config):
         group = tiledb.Group(uri, "w")
         tile_size = TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions
-        ids_array_name = storage_formats[STORAGE_VERSION]["IDS_ARRAY_NAME"]
-        parts_array_name = storage_formats[STORAGE_VERSION]["PARTS_ARRAY_NAME"]
+        ids_array_name = storage_formats[storage_version]["IDS_ARRAY_NAME"]
+        parts_array_name = storage_formats[storage_version]["PARTS_ARRAY_NAME"]
         ids_uri = f"{uri}/{ids_array_name}"
         parts_uri = f"{uri}/{parts_array_name}"

@@ -147,7 +152,7 @@ def create(
         ids_attr = tiledb.Attr(
             name="values",
             dtype=np.dtype(np.uint64),
-            filters=storage_formats[STORAGE_VERSION]["DEFAULT_ATTR_FILTERS"],
+            filters=storage_formats[storage_version]["DEFAULT_ATTR_FILTERS"],
         )
         ids_schema = tiledb.ArraySchema(
             domain=ids_array_dom,
@@ -175,7 +180,7 @@ def create(
         parts_attr = tiledb.Attr(
             name="values",
             dtype=vector_type,
-            filters=storage_formats[STORAGE_VERSION]["DEFAULT_ATTR_FILTERS"],
+            filters=storage_formats[storage_version]["DEFAULT_ATTR_FILTERS"],
         )
         parts_schema = tiledb.ArraySchema(
             domain=parts_array_dom,

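For illustration, a minimal sketch of how a caller might pin a flat index to a specific storage format at creation time; the URIs and the "0.2" version string are assumptions (any key present in storage_formats would work), and the same pattern applies to ivf_flat_index.create further below.

import numpy as np
from tiledb.vector_search import flat_index

# Omitting storage_version keeps the previous behaviour: the latest
# STORAGE_VERSION is used.
index = flat_index.create(
    uri="/tmp/my_flat_index",            # hypothetical URI
    dimensions=128,
    vector_type=np.dtype(np.uint8),
)

# Passing an explicit version writes the group with that format's array
# names and attribute filters (assuming "0.2" is a storage_formats key).
older = flat_index.create(
    uri="/tmp/my_flat_index_0_2",        # hypothetical URI
    dimensions=128,
    vector_type=np.dtype(np.uint8),
    storage_version="0.2",
)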
apis/python/src/tiledb/vector_search/index.py
Lines changed: 2 additions & 1 deletion

@@ -489,6 +489,7 @@ def create_metadata(
     dimensions: int,
     vector_type: np.dtype,
     index_type: str,
+    storage_version: str,
     group_exists: bool = False,
     config: Optional[Mapping[str, Any]] = None,
 ):
@@ -501,7 +502,7 @@ def create_metadata(
         group = tiledb.Group(uri, "w")
         group.meta["dataset_type"] = DATASET_TYPE
         group.meta["dtype"] = np.dtype(vector_type).name
-        group.meta["storage_version"] = STORAGE_VERSION
+        group.meta["storage_version"] = storage_version
         group.meta["index_type"] = index_type
         group.meta["base_sizes"] = json.dumps([0])
         group.meta["ingestion_timestamps"] = json.dumps([0])

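Since create_metadata now records the caller's choice rather than the module constant, the version an index was written with can be read back from the group metadata. A small sketch, assuming an index group already exists at the given (hypothetical) URI:

import tiledb

uri = "/tmp/my_flat_index"   # hypothetical URI of an existing index group

group = tiledb.Group(uri)    # opened read-only by default
# "storage_version" is the metadata key written by create_metadata above.
print(group.meta["storage_version"], group.meta["index_type"])
group.close()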
apis/python/src/tiledb/vector_search/ingestion.py
Lines changed: 8 additions & 2 deletions

@@ -6,7 +6,7 @@
 from tiledb.cloud.dag import Mode

 from tiledb.vector_search._tiledbvspy import *
-from tiledb.vector_search.storage_formats import STORAGE_VERSION
+from tiledb.vector_search.storage_formats import STORAGE_VERSION, validate_storage_version


 def ingest(
@@ -88,7 +88,7 @@ def ingest(
         Max number of tasks per execution stage of ingestion,
         if not provided, is auto-configured
     storage_version: str
-        Vector index storage format version.
+        Vector index storage format version. If not provided, defaults to the latest version.
     verbose: bool
         verbose logging, defaults to False
     trace_id: Optional[str]
@@ -119,6 +119,8 @@ def ingest(
     from tiledb.vector_search.index import Index
     from tiledb.vector_search.storage_formats import storage_formats

+    validate_storage_version(storage_version)
+
     # use index_group_uri for internal clarity
     index_group_uri = index_uri

@@ -355,6 +357,7 @@ def create_arrays(
         input_vectors_work_items: int,
         vector_type: np.dtype,
         logger: logging.Logger,
+        storage_version: str,
     ) -> None:
         if index_type == "FLAT":
             if not arrays_created:
@@ -364,6 +367,7 @@ def create_arrays(
                     vector_type=vector_type,
                     group_exists=True,
                     config=config,
+                    storage_version=storage_version
                 )
         elif index_type == "IVF_FLAT":
             if not arrays_created:
@@ -373,6 +377,7 @@ def create_arrays(
                     vector_type=vector_type,
                     group_exists=True,
                     config=config,
+                    storage_version=storage_version
                 )
                 tile_size = int(
                     ivf_flat_index.TILE_SIZE_BYTES
@@ -1935,6 +1940,7 @@ def consolidate_and_vacuum(
            input_vectors_work_items=input_vectors_work_items,
            vector_type=vector_type,
            logger=logger,
+           storage_version=storage_version
         )
         group.meta["temp_size"] = size
         group.close()

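A hedged sketch of how ingestion might be driven with an explicit storage version, and what happens for an unknown one; the paths and the "0.2" version string are illustrative assumptions:

from tiledb.vector_search.ingestion import ingest

source_uri = "/tmp/dataset/data.u8bin"   # hypothetical source path

# Write the index using an older format version (assuming "0.2" is a key
# in storage_formats); omitting storage_version uses the latest.
index = ingest(
    index_type="IVF_FLAT",
    index_uri="/tmp/array_0_2",          # hypothetical index URI
    source_uri=source_uri,
    partitions=10,
    storage_version="0.2",
)

# An unknown version now fails fast, before any arrays are created.
try:
    ingest(
        index_type="FLAT",
        index_uri="/tmp/array_bad",      # hypothetical index URI
        source_uri=source_uri,
        partitions=10,
        storage_version="not-a-version",
    )
except ValueError as err:
    print(err)  # Invalid storage version: not-a-version. Valid versions are: [...]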
apis/python/src/tiledb/vector_search/ivf_flat_index.py
Lines changed: 14 additions & 9 deletions

@@ -8,7 +8,8 @@
 from tiledb.vector_search import index
 from tiledb.vector_search.module import *
 from tiledb.vector_search.storage_formats import (STORAGE_VERSION,
-                                                   storage_formats)
+                                                   storage_formats,
+                                                   validate_storage_version)

 MAX_INT32 = np.iinfo(np.dtype("int32")).max
 TILE_SIZE_BYTES = 64000000 # 64MB
@@ -450,24 +451,28 @@ def create(
     vector_type: np.dtype,
     group_exists: bool = False,
     config: Optional[Mapping[str, Any]] = None,
+    storage_version: str = STORAGE_VERSION,
     **kwargs,
 ) -> IVFFlatIndex:
+    validate_storage_version(storage_version)
+
     index.create_metadata(
         uri=uri,
         dimensions=dimensions,
         vector_type=vector_type,
         index_type=INDEX_TYPE,
+        storage_version=storage_version,
         group_exists=group_exists,
         config=config,
     )
     with tiledb.scope_ctx(ctx_or_config=config):
         group = tiledb.Group(uri, "w")
         tile_size = int(TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions)
         group.meta["partition_history"] = json.dumps([0])
-        centroids_array_name = storage_formats[STORAGE_VERSION]["CENTROIDS_ARRAY_NAME"]
-        index_array_name = storage_formats[STORAGE_VERSION]["INDEX_ARRAY_NAME"]
-        ids_array_name = storage_formats[STORAGE_VERSION]["IDS_ARRAY_NAME"]
-        parts_array_name = storage_formats[STORAGE_VERSION]["PARTS_ARRAY_NAME"]
+        centroids_array_name = storage_formats[storage_version]["CENTROIDS_ARRAY_NAME"]
+        index_array_name = storage_formats[storage_version]["INDEX_ARRAY_NAME"]
+        ids_array_name = storage_formats[storage_version]["IDS_ARRAY_NAME"]
+        parts_array_name = storage_formats[storage_version]["PARTS_ARRAY_NAME"]
         centroids_uri = f"{uri}/{centroids_array_name}"
         index_array_uri = f"{uri}/{index_array_name}"
         ids_uri = f"{uri}/{ids_array_name}"
@@ -491,7 +496,7 @@ def create(
         centroids_attr = tiledb.Attr(
             name="centroids",
             dtype=np.dtype(np.float32),
-            filters=storage_formats[STORAGE_VERSION]["DEFAULT_ATTR_FILTERS"],
+            filters=storage_formats[storage_version]["DEFAULT_ATTR_FILTERS"],
         )
         centroids_schema = tiledb.ArraySchema(
             domain=centroids_array_dom,
@@ -513,7 +518,7 @@ def create(
         index_attr = tiledb.Attr(
             name="values",
             dtype=np.dtype(np.uint64),
-            filters=storage_formats[STORAGE_VERSION]["DEFAULT_ATTR_FILTERS"],
+            filters=storage_formats[storage_version]["DEFAULT_ATTR_FILTERS"],
         )
         index_schema = tiledb.ArraySchema(
             domain=index_array_dom,
@@ -535,7 +540,7 @@ def create(
         ids_attr = tiledb.Attr(
             name="values",
             dtype=np.dtype(np.uint64),
-            filters=storage_formats[STORAGE_VERSION]["DEFAULT_ATTR_FILTERS"],
+            filters=storage_formats[storage_version]["DEFAULT_ATTR_FILTERS"],
         )
         ids_schema = tiledb.ArraySchema(
             domain=ids_array_dom,
@@ -563,7 +568,7 @@ def create(
         parts_attr = tiledb.Attr(
             name="values",
             dtype=vector_type,
-            filters=storage_formats[STORAGE_VERSION]["DEFAULT_ATTR_FILTERS"],
+            filters=storage_formats[storage_version]["DEFAULT_ATTR_FILTERS"],
         )
         parts_schema = tiledb.ArraySchema(
             domain=parts_array_dom,

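For context, the per-version array names and attribute filters come from the storage_formats mapping, which is what the create() functions above now key by the caller's storage_version instead of the module constant. A rough sketch of that lookup, using only the keys that appear in the diff (the "0.3" literal is just the current STORAGE_VERSION and stands in for whatever the group metadata records):

from tiledb.vector_search.storage_formats import storage_formats

storage_version = "0.3"  # e.g. read from the index group's metadata

fmt = storage_formats[storage_version]
centroids_array_name = fmt["CENTROIDS_ARRAY_NAME"]
index_array_name = fmt["INDEX_ARRAY_NAME"]
ids_array_name = fmt["IDS_ARRAY_NAME"]
parts_array_name = fmt["PARTS_ARRAY_NAME"]
default_filters = fmt["DEFAULT_ATTR_FILTERS"]

# create() joins these names onto the group URI, e.g. f"{uri}/{ids_array_name}",
# so the on-disk layout follows the chosen storage version.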
apis/python/src/tiledb/vector_search/storage_formats.py
Lines changed: 5 additions & 0 deletions

@@ -40,3 +40,8 @@
 }

 STORAGE_VERSION = "0.3"
+
+def validate_storage_version(storage_version):
+    if storage_version not in storage_formats:
+        valid_versions = ', '.join(storage_formats.keys())
+        raise ValueError(f"Invalid storage version: {storage_version}. Valid versions are: [{valid_versions}]")

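A quick illustration of the new helper's behaviour; the "bogus" string is just an example of an unsupported version:

from tiledb.vector_search.storage_formats import (STORAGE_VERSION,
                                                   storage_formats,
                                                   validate_storage_version)

validate_storage_version(STORAGE_VERSION)   # latest version: no error
for version in storage_formats:             # every known version is accepted
    validate_storage_version(version)

try:
    validate_storage_version("bogus")
except ValueError as err:
    print(err)  # Invalid storage version: bogus. Valid versions are: [...]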
apis/python/test/test_ingestion.py
Lines changed: 70 additions & 1 deletion

@@ -1,7 +1,8 @@
 import numpy as np
 from common import *
-from tiledb.cloud.dag import Mode
+import pytest

+from tiledb.cloud.dag import Mode
 from tiledb.vector_search.flat_index import FlatIndex
 from tiledb.vector_search.index import Index
 from tiledb.vector_search.ingestion import ingest
@@ -416,6 +417,7 @@ def test_ivf_flat_ingestion_with_batch_updates(tmp_path):
     _, result = index.query(query_vectors, k=k, nprobe=nprobe)
     assert accuracy(result, gt_i, updated_ids=updated_ids) > 0.99

+
 def test_ivf_flat_ingestion_with_updates_and_timetravel(tmp_path):
     dataset_dir = os.path.join(tmp_path, "dataset")
     index_uri = os.path.join(tmp_path, "array")
@@ -669,6 +671,73 @@ def test_ivf_flat_ingestion_with_additions_and_timetravel(tmp_path):
     _, result = index.query(query_vectors, k=k, nprobe=index.partitions)
     assert 0.45 < accuracy(result, gt_i) < 0.55

+
+def test_storage_versions(tmp_path):
+    dataset_dir = os.path.join(tmp_path, "dataset")
+    k = 10
+    size = 1000
+    partitions = 10
+    dimensions = 128
+    nqueries = 100
+    data = create_random_dataset_u8(nb=size, d=dimensions, nq=nqueries, k=k, path=dataset_dir)
+    source_uri = os.path.join(dataset_dir, "data.u8bin")
+
+    dtype = np.uint8
+    query_vectors = get_queries(dataset_dir, dtype=dtype)
+    gt_i, _ = get_groundtruth(dataset_dir, k)
+
+    indexes = ["FLAT", "IVF_FLAT"]
+    index_classes = [FlatIndex, IVFFlatIndex]
+    index_files = [tiledb.vector_search.flat_index, tiledb.vector_search.ivf_flat_index]
+    for index_type, index_class, index_file in zip(indexes, index_classes, index_files):
+        # First we test with an invalid storage version.
+        with pytest.raises(ValueError) as error:
+            index_uri = os.path.join(tmp_path, f"array_{index_type}_invalid")
+            ingest(
+                index_type=index_type,
+                index_uri=index_uri,
+                source_uri=source_uri,
+                partitions=partitions,
+                storage_version="Foo"
+            )
+        assert "Invalid storage version" in str(error.value)
+
+        with pytest.raises(ValueError) as error:
+            index_file.create(uri=index_uri, dimensions=3, vector_type=np.dtype(dtype), storage_version="Foo")
+        assert "Invalid storage version" in str(error.value)
+
+        # Then we test with valid storage versions.
+        for storage_version, _ in tiledb.vector_search.storage_formats.items():
+            index_uri = os.path.join(tmp_path, f"array_{index_type}_{storage_version}")
+            index = ingest(
+                index_type=index_type,
+                index_uri=index_uri,
+                source_uri=source_uri,
+                partitions=partitions,
+                storage_version=storage_version
+            )
+            _, result = index.query(query_vectors, k=k)
+            assert accuracy(result, gt_i) >= MINIMUM_ACCURACY
+
+            update_ids_offset = MAX_UINT64 - size
+            updated_ids = {}
+            for i in range(10):
+                index.delete(external_id=i)
+                index.update(vector=data[i].astype(dtype), external_id=i + update_ids_offset)
+                updated_ids[i] = i + update_ids_offset
+
+            _, result = index.query(query_vectors, k=k)
+            assert accuracy(result, gt_i, updated_ids=updated_ids) >= MINIMUM_ACCURACY
+
+            index = index.consolidate_updates(partitions=20)
+            _, result = index.query(query_vectors, k=k)
+            assert accuracy(result, gt_i, updated_ids=updated_ids) >= MINIMUM_ACCURACY
+
+            index_ram = index_class(uri=index_uri)
+            _, result = index_ram.query(query_vectors, k=k)
+            assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
+
 def test_kmeans():
     k = 128
     d = 16
