Skip to content

Commit b0c5208

Browse files
authored
Run backwards compatibility unit tests in C++ and read types metadata in ivf_flat_index.h (#270)
1 parent 951804d commit b0c5208

File tree

8 files changed

+220
-32
lines changed

8 files changed

+220
-32
lines changed

src/include/api/ivf_flat_index.h

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "api/feature_vector.h"
4242
#include "api/feature_vector_array.h"
4343
#include "api_defs.h"
44+
#include "index/ivf_flat_group.h"
4445
#include "index/ivf_flat_index.h"
4546
#include "tiledb/group_experimental.h"
4647

@@ -138,27 +139,55 @@ class IndexIVFFlat {
138139
const tiledb::Context& ctx,
139140
const URI& group_uri,
140141
const std::optional<IndexOptions>& config = std::nullopt) {
141-
using metadata_element = std::tuple<std::string, void*, tiledb_datatype_t>;
142+
// Holds {metadata_name, address, datatype, array_key}
143+
using metadata_element =
144+
std::tuple<std::string, void*, tiledb_datatype_t, std::string>;
142145
std::vector<metadata_element> metadata{
143-
{"feature_datatype", &feature_datatype_, TILEDB_UINT32},
144-
{"id_datatype", &id_datatype_, TILEDB_UINT32},
145-
{"px_datatype", &px_datatype_, TILEDB_UINT32}};
146+
{"feature_datatype",
147+
&feature_datatype_,
148+
TILEDB_UINT32,
149+
"parts_array_name"},
150+
{"id_datatype", &id_datatype_, TILEDB_UINT32, "ids_array_name"},
151+
{"px_datatype", &px_datatype_, TILEDB_UINT32, "index_array_name"}};
146152

147153
tiledb::Config cfg;
148154
tiledb::Group read_group(ctx, group_uri, TILEDB_READ, cfg);
149155

150-
for (auto& [name, value, datatype] : metadata) {
151-
if (!read_group.has_metadata(name, &datatype)) {
152-
throw std::runtime_error("Missing metadata: " + name);
156+
// Get the storage_version in case the metadata is not present on read_group
157+
// and we need to read the individual arrays.
158+
std::string storage_version = current_storage_version;
159+
tiledb_datatype_t v_type;
160+
if (read_group.has_metadata("storage_version", &v_type)) {
161+
uint32_t v_num;
162+
const void* v;
163+
read_group.get_metadata("storage_version", &v_type, &v_num, &v);
164+
if (v_type == TILEDB_STRING_ASCII || v_type == TILEDB_STRING_UTF8) {
165+
storage_version = std::string(static_cast<const char*>(v), v_num);
153166
}
154-
uint32_t count;
155-
void* addr;
156-
read_group.get_metadata(name, &datatype, &count, (const void**)&addr);
157-
if (datatype == TILEDB_UINT32) {
158-
*reinterpret_cast<uint32_t*>(value) =
159-
*reinterpret_cast<uint32_t*>(addr);
167+
}
168+
169+
for (auto& [name, value, datatype, array_key] : metadata) {
170+
// We first try to read metadata from the group.
171+
if (read_group.has_metadata(name, &datatype)) {
172+
uint32_t count;
173+
void* addr;
174+
read_group.get_metadata(name, &datatype, &count, (const void**)&addr);
175+
if (datatype == TILEDB_UINT32) {
176+
*reinterpret_cast<uint32_t*>(value) =
177+
*reinterpret_cast<uint32_t*>(addr);
178+
} else {
179+
throw std::runtime_error(
180+
"Unsupported datatype for metadata: " + name);
181+
}
160182
} else {
161-
throw std::runtime_error("Unsupported datatype for metadata: " + name);
183+
// If it is not present then fallback to checking the type on the array
184+
// directly.
185+
auto uri = array_name_to_uri(
186+
group_uri,
187+
array_key_to_array_name_from_map(
188+
storage_formats[storage_version], array_key));
189+
tiledb::ArraySchema schema(ctx, uri);
190+
*reinterpret_cast<uint32_t*>(value) = schema.attribute(0).type();
162191
}
163192
}
164193

src/include/detail/linalg/matrix.h

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -501,19 +501,29 @@ void debug_slice(
501501
const std::string& msg = "",
502502
size_t rows = 6,
503503
size_t cols = 18) {
504-
if (matrix_printf || true) {
505-
rows = std::min(rows, dimension(A));
506-
cols = std::min(cols, num_vectors(A));
507-
508-
std::cout << "# " << msg << std::endl;
509-
for (size_t i = 0; i < rows; ++i) {
510-
std::cout << "# ";
511-
for (size_t j = 0; j < cols; ++j) {
512-
std::cout << (float)A(i, j) << "\t";
513-
}
514-
std::cout << std::endl;
504+
auto max_size = 10;
505+
auto rowsEnd = std::min(dimension(A), static_cast<size_t>(max_size));
506+
auto colsEnd = std::min(num_vectors(A), static_cast<size_t>(max_size));
507+
508+
std::cout << "# " << msg << std::endl;
509+
for (size_t i = 0; i < rowsEnd; ++i) {
510+
std::cout << "# ";
511+
for (size_t j = 0; j < colsEnd; ++j) {
512+
std::cout << (float)A(i, j) << " ";
513+
}
514+
if (A.num_cols() > max_size) {
515+
std::cout << "...";
515516
}
517+
std::cout << std::endl;
516518
}
519+
if (A.num_rows() > max_size) {
520+
std::cout << "# ..." << std::endl;
521+
}
522+
}
523+
524+
template <feature_vector_array M>
525+
void debug(const M& A, const std::string& msg = "") {
526+
debug_slice(A, msg, dimension(A), num_vectors(A));
517527
}
518528

519529
template <class Matrix1, class Matrix2>

src/include/index/index_defs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ enum class IndexKind {
7979
[[maybe_unused]] static std::string current_storage_version{"0.3"};
8080

8181
// @todo Use enum for key rather than string?
82-
using StorageFormat = std::map<std::string, std::map<std::string, std::string>>;
82+
using StorageFormat =
83+
std::map<std::string, std::unordered_map<std::string, std::string>>;
8384
[[maybe_unused]] static StorageFormat storage_formats = {
8485
{"0.1",
8586
{

src/include/index/index_group.h

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,24 @@
5353
#include "mdspan/mdspan.hpp"
5454
#include "tdb_defs.h"
5555

56+
/** Lookup an array name given an array key */
57+
inline std::string array_key_to_array_name_from_map(
58+
const std::unordered_map<std::string, std::string>& map,
59+
const std::string& array_key) {
60+
if (map.find(array_key) == map.end()) {
61+
throw std::runtime_error("Invalid array key in map: " + array_key);
62+
}
63+
auto tmp = *map.find(array_key);
64+
return tmp.second;
65+
};
66+
67+
/** Convert an array name to a uri. */
68+
inline std::string array_name_to_uri(
69+
const std::string& group_uri, const std::string& array_name) {
70+
return (std::filesystem::path{group_uri} / std::filesystem::path{array_name})
71+
.string();
72+
}
73+
5674
template <class Index>
5775
class base_index_metadata;
5876

@@ -119,8 +137,7 @@ class base_index_group {
119137
if (!is_valid_key_name(array_key)) {
120138
throw std::runtime_error("Invalid array key: " + array_key);
121139
}
122-
auto tmp = *array_name_map_.find(array_key);
123-
return tmp.second;
140+
return array_key_to_array_name_from_map(array_name_map_, array_key);
124141
};
125142

126143
/** Create the set of valid key names and array names */
@@ -279,9 +296,7 @@ class base_index_group {
279296
/** Convert an array name to a uri. */
280297
constexpr std::string array_name_to_uri(
281298
const std::string& array_name) const noexcept {
282-
return (std::filesystem::path{group_uri_} /
283-
std::filesystem::path{array_name})
284-
.string();
299+
return array_name_to_uri(group_uri_, array_name);
285300
}
286301

287302
/** Convert an array key to a uri. */

src/include/index/index_metadata.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,8 @@ class base_index_metadata {
158158
* @return
159159
*/
160160
auto check_string_metadata(
161-
tiledb::Group& read_group, const metadata_string_check_type& check) {
161+
tiledb::Group& read_group,
162+
const metadata_string_check_type& check) const {
162163
tiledb_datatype_t v_type;
163164
uint32_t v_num;
164165
const void* v;

src/include/test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ kmeans_add_test(unit_api_vamana_index)
7878

7979
kmeans_add_test(unit_array_defs)
8080

81+
kmeans_add_test(unit_backwards_compatibility)
82+
8183
kmeans_add_test(unit_concepts)
8284

8385
kmeans_add_test(unit_concepts_vs)

src/include/test/array_defs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ static std::string test_data_root =
6969
static std::string test_array_root{test_data_root / "arrays"};
7070
static std::string test_file_root{test_data_root / "files"};
7171
static std::string nano_root{test_data_root / "nano"};
72+
static std::string backwards_compatibility_root =
73+
(cmake_source_dir.parent_path() / "backwards-compatibility-data").string();
7274

7375
/**
7476
* @brief Array URIs for arrays used for unit testing of IVF indexes.
@@ -268,6 +270,9 @@ static std::string siftsmall_inputs_file{
268270
static std::string siftsmall_query_file{siftsmall_files_root / "queries.fvecs"};
269271
static std::string siftsmall_groundtruth_file{
270272
siftsmall_files_root / "groundtruth.ivecs"};
273+
// Used for backwards compatability:
274+
static std::string siftmicro_inputs_file{
275+
backwards_compatibility_root / "siftmicro_base.fvecs"};
271276

272277
/**
273278
* @brief Data files used in unit tests in the DiskANN git repo.
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/**
2+
* @file unit_backwards_compatibility.cc
3+
*
4+
* @section LICENSE
5+
*
6+
* The MIT License
7+
*
8+
* @copyright Copyright (c) 2023 TileDB, Inc.
9+
*
10+
* Permission is hereby granted, free of charge, to any person obtaining a copy
11+
* of this software and associated documentation files (the "Software"), to deal
12+
* in the Software without restriction, including without limitation the rights
13+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14+
* copies of the Software, and to permit persons to whom the Software is
15+
* furnished to do so, subject to the following conditions:
16+
*
17+
* The above copyright notice and this permission notice shall be included in
18+
* all copies or substantial portions of the Software.
19+
*
20+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26+
* THE SOFTWARE.
27+
*
28+
* @section DESCRIPTION
29+
*
30+
* Test that the current codebase can read and query old indexes.
31+
*
32+
*/
33+
34+
#include <catch2/catch_all.hpp>
35+
#include <iostream>
36+
#include "api/feature_vector_array.h"
37+
#include "api/flat_l2_index.h"
38+
#include "api/ivf_flat_index.h"
39+
#include "api/vamana_index.h"
40+
#include "array_defs.h"
41+
#include "detail/linalg/matrix.h"
42+
#include "index/flat_l2_index.h"
43+
#include "index/ivf_flat_index.h"
44+
#include "index/vamana_index.h"
45+
#include "mdspan/mdspan.hpp"
46+
#include "utils/print_types.h"
47+
48+
TEST_CASE("backwards_compatibility: test test", "[backwards_compatibility]") {
49+
REQUIRE(true);
50+
}
51+
52+
TEST_CASE(
53+
"backwards_compatibility: test_query_old_indices",
54+
"[backwards_compatibility]") {
55+
tiledb::Context ctx;
56+
std::string datasets_path = backwards_compatibility_root / "data";
57+
auto base =
58+
read_bin_local<siftsmall_feature_type>(ctx, siftmicro_inputs_file);
59+
60+
std::vector<size_t> query_indices = {0, 3, 4, 8, 10, 19, 28, 31,
61+
39, 40, 41, 47, 49, 50, 56, 64,
62+
68, 70, 71, 79, 82, 89, 90, 94};
63+
std::vector<std::span<siftsmall_feature_type>> queries;
64+
for (size_t idx : query_indices) {
65+
queries.push_back(base[idx]);
66+
}
67+
68+
auto queries_matrix = ColMajorMatrix<siftsmall_feature_type>(
69+
queries[0].size(), query_indices.size());
70+
for (size_t i = 0; i < query_indices.size(); ++i) {
71+
for (size_t j = 0; j < queries[0].size(); ++j) {
72+
queries_matrix(j, i) = queries[i][j];
73+
}
74+
}
75+
auto queries_feature_vector_array = FeatureVectorArray(queries_matrix);
76+
77+
for (const auto& directory_name :
78+
std::filesystem::directory_iterator(datasets_path)) {
79+
std::string version_path = directory_name.path().string();
80+
if (!std::filesystem::is_directory(version_path)) {
81+
continue;
82+
}
83+
84+
for (const auto& index_name :
85+
std::filesystem::directory_iterator(version_path)) {
86+
std::string index_uri = index_name.path().string();
87+
if (!std::filesystem::is_directory(index_uri)) {
88+
continue;
89+
}
90+
// TODO(paris): Fix bug where we can't load old indexes b/c of a storage
91+
// version mismatch in metadata.
92+
if (index_uri.find("0.0.10") != std::string::npos ||
93+
index_uri.find("0.0.17") != std::string::npos) {
94+
continue;
95+
}
96+
std::vector<float> expected_distances(query_indices.size(), 0.0);
97+
if (index_uri.find("ivf_flat") != std::string::npos) {
98+
auto index = IndexIVFFlat(ctx, index_uri);
99+
auto&& [scores, ids] =
100+
index.query_infinite_ram(queries_feature_vector_array, 1, 10);
101+
auto scores_span =
102+
MatrixView<siftsmall_feature_type, stdx::layout_left>{
103+
(siftsmall_feature_type*)scores.data(),
104+
extents(scores)[0],
105+
extents(scores)[1]};
106+
107+
auto ids_span = MatrixView<siftsmall_ids_type, stdx::layout_left>{
108+
(siftsmall_ids_type*)ids.data(), extents(ids)[0], extents(ids)[1]};
109+
110+
for (size_t i = 0; i < query_indices.size(); ++i) {
111+
CHECK(ids_span[0][i] == query_indices[i]);
112+
CHECK(scores_span[0][i] == 0);
113+
}
114+
} else if (index_uri.find("flat") != std::string::npos) {
115+
// TODO(paris): Fix flat_l2_index and re-enable. Right now it just tries
116+
// to load the URI as a tdbMatrix.
117+
} else if (index_uri.find("vamana") != std::string::npos) {
118+
// TODO(paris): Add once we save a vamana index in backwards
119+
// compatibility data.
120+
} else {
121+
REQUIRE(false);
122+
}
123+
}
124+
}
125+
}

0 commit comments

Comments
 (0)