Skip to content

Commit 9013853

Browse files
authored
Add metadata tests for type-erased indexes in Python (#334)
1 parent 83bcc2b commit 9013853

File tree

4 files changed, with 32 additions and 63 deletions.

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from tiledb.vector_search.storage_formats import STORAGE_VERSION
1010
from tiledb.vector_search.storage_formats import validate_storage_version
1111
from tiledb.vector_search.utils import add_to_group
12+
from tiledb.vector_search.utils import is_type_erased_index
1213

1314

1415
class TrainingSamplingPolicy(enum.Enum):
@@ -290,9 +291,6 @@ def ingest(
290291
DEFAULT_IMG_NAME = "3.9-vectorsearch"
291292
MAX_INT32 = 2**31 - 1
292293

293-
def is_type_erased_index():
294-
return index_type == "VAMANA"
295-
296294
class SourceType(enum.Enum):
297295
"""SourceType of input vectors"""
298296

@@ -755,7 +753,7 @@ def create_arrays(
755753

756754
# Note that we don't create type-erased indexes (i.e. Vamana) here. Instead we create them
757755
# at very start of ingest() in C++.
758-
elif not is_type_erased_index():
756+
elif not is_type_erased_index(index_type):
759757
raise ValueError(f"Not supported index_type {index_type}")
760758

761759
def read_external_ids(
@@ -2825,7 +2823,7 @@ def consolidate_and_vacuum(
28252823
temp_size = int(group.meta.get("temp_size", "0"))
28262824
group.close()
28272825

2828-
if not is_type_erased_index():
2826+
if not is_type_erased_index(index_type):
28292827
# For type-erased indexes (i.e. Vamana), we update this metadata in the write_index()
28302828
# call during create_ingestion_dag(), so don't do it here.
28312829
group = tiledb.Group(index_group_uri, "w")

apis/python/src/tiledb/vector_search/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
import tiledb
66

77

8+
def is_type_erased_index(index_type: str) -> bool:
9+
return index_type == "VAMANA"
10+
11+
812
def add_to_group(group, uri, name):
913
"""
1014
Adds an object to a group. Automatically infers whether to use a relative path or absolute path.

apis/python/test/test_index.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import time
23

34
import numpy as np
45
import pytest
@@ -15,6 +16,7 @@
1516
from tiledb.vector_search.index import create_metadata
1617
from tiledb.vector_search.ingestion import ingest
1718
from tiledb.vector_search.ivf_flat_index import IVFFlatIndex
19+
from tiledb.vector_search.utils import is_type_erased_index
1820
from tiledb.vector_search.utils import load_fvecs
1921
from tiledb.vector_search.vamana_index import VamanaIndex
2022

@@ -55,16 +57,31 @@ def check_default_metadata(
5557
assert type(group.meta["index_type"]) == str
5658

5759
assert "base_sizes" in group.meta
58-
assert group.meta["base_sizes"] == json.dumps([0])
60+
if is_type_erased_index(expected_index_type):
61+
# NOTE(paris): Type-erased indexes have two values upon creation.
62+
assert group.meta["base_sizes"] == "[0,0]"
63+
else:
64+
assert group.meta["base_sizes"] == json.dumps([0])
5965
assert type(group.meta["base_sizes"]) == str
6066

6167
assert "ingestion_timestamps" in group.meta
62-
assert group.meta["ingestion_timestamps"] == json.dumps([0])
68+
if is_type_erased_index(expected_index_type):
69+
# NOTE(paris): Type-erased indexes have two values upon creation.
70+
ingestion_timestamps = json.loads(group.meta["ingestion_timestamps"])
71+
assert len(ingestion_timestamps) == 2
72+
assert ingestion_timestamps[0] == 0
73+
current_time_ms = int(time.time() * 1000)
74+
assert ingestion_timestamps[1] < current_time_ms
75+
assert ingestion_timestamps[1] > current_time_ms - 1000 * 5
76+
else:
77+
assert group.meta["ingestion_timestamps"] == json.dumps([0])
6378
assert type(group.meta["ingestion_timestamps"]) == str
6479

65-
assert "has_updates" in group.meta
66-
assert group.meta["has_updates"] == 0
67-
assert type(group.meta["has_updates"]) == np.int64
80+
if not is_type_erased_index(expected_index_type):
81+
# NOTE(paris): Type-erased indexes do not write has_updates.
82+
assert "has_updates" in group.meta
83+
assert group.meta["has_updates"] == 0
84+
assert type(group.meta["has_updates"]) == np.int64
6885

6986

7087
def test_flat_index(tmp_path):
@@ -213,6 +230,7 @@ def test_vamana_index(tmp_path):
213230
query_and_check_distances(
214231
index, queries, 1, [[ind.MAX_FLOAT_32]], [[ind.MAX_UINT64]]
215232
)
233+
check_default_metadata(uri, vector_type, STORAGE_VERSION, "VAMANA")
216234

217235
update_vectors = np.empty([5], dtype=object)
218236
update_vectors[0] = np.array([0, 0, 0], dtype=np.dtype(np.float32))

src/include/index/index_metadata.h

Lines changed: 2 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ class base_index_metadata {
112112
using metadata_string_check_type =
113113
std::tuple<std::string, std::string&, bool>;
114114
std::vector<metadata_string_check_type> metadata_string_checks{
115-
// name, member_variable, default, expected, required
115+
// name, member_variable, required
116116
{"dataset_type", dataset_type_, true},
117117
{"storage_version", storage_version_, true},
118118
{"dtype", dtype_, false},
@@ -127,11 +127,8 @@ class base_index_metadata {
127127
using metadata_arithmetic_check_type =
128128
std::tuple<std::string, void*, tiledb_datatype_t, bool>;
129129
std::vector<metadata_arithmetic_check_type> metadata_arithmetic_checks{
130+
// name, member_variable, type, required
130131
{"temp_size", &temp_size_, TILEDB_INT64, true},
131-
//{"index_kind",
132-
// nstatic_cast<IndexMetadata*>(this)->index_kind_,
133-
// TILEDB_UINT64,
134-
// false},
135132
{"dimension", &dimension_, TILEDB_UINT32, false},
136133
{"feature_datatype", &feature_datatype_, TILEDB_UINT32, false},
137134
{"id_datatype", &id_datatype_, TILEDB_UINT32, false},
@@ -339,54 +336,6 @@ class base_index_metadata {
339336
}
340337
}
341338

342-
#if 0
343-
/**************************************************************************
344-
* Getters and setters
345-
**************************************************************************/
346-
std::string base_sizes_str() const {
347-
return base_sizes_str_;
348-
}
349-
auto& base_sizes() {
350-
return base_sizes_;
351-
}
352-
auto& base_sizes() const {
353-
return base_sizes_;
354-
}
355-
void set_base_sizes(const std::vector<base_sizes_type>& base_sizes) {
356-
base_sizes_ = base_sizes;
357-
}
358-
auto storage_version() const {
359-
return storage_version_;
360-
}
361-
auto& storage_version() {
362-
return storage_version_;
363-
}
364-
auto dtype() const {
365-
return dtype_;
366-
}
367-
auto& dtype() {
368-
return dtype_;
369-
}
370-
auto feature_datatype() const {
371-
return feature_datatype_;
372-
}
373-
auto& feature_datatype() {
374-
return feature_datatype_;
375-
}
376-
auto id_datatype() const {
377-
return id_datatype_;
378-
}
379-
auto& id_datatype() {
380-
return id_datatype_;
381-
}
382-
auto px_datatype() const {
383-
return px_datatype_;
384-
}
385-
auto& px_datatype() {
386-
return px_datatype_;
387-
}
388-
#endif
389-
390339
/**************************************************************************
391340
* Helpful functions for debugging, testing, etc
392341
**************************************************************************/

0 commit comments

Comments
 (0)