Skip to content

Commit b76d098

Browse files
authored
feature: add dynamic support for flat index (#160)
To help integration with [Faiss](facebookresearch/faiss#4450), this PR adds dynamic data support to the flat index by using a generic Data concept instead of the ImmutableMemoryDataset concept, which makes it possible to pass simple data that can be resized.
1 parent 18ba515 commit b76d098

File tree

9 files changed

+1758
-0
lines changed

9 files changed

+1758
-0
lines changed

bindings/python/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ add_subdirectory("../.." "${CMAKE_CURRENT_BINARY_DIR}/svs")
3131
set(CPP_FILES
3232
src/allocator.cpp
3333
src/dynamic_vamana.cpp
34+
src/dynamic_flat.cpp
3435
src/core.cpp
3536
src/flat.cpp
3637
src/python_bindings.cpp
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
* Copyright 2025 Intel Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <pybind11/pybind11.h>
20+
21+
namespace svs::python::dynamic_flat {
22+
/// Register the ``DynamicFlat`` index class and its methods on the Python module ``m``.
void wrap(pybind11::module& m);
23+
} // namespace svs::python::dynamic_flat
Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
/*
2+
* Copyright 2025 Intel Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
// svs python bindings
18+
#include "svs/python/dynamic_flat.h"
19+
#include "svs/python/common.h"
20+
#include "svs/python/core.h"
21+
#include "svs/python/flat.h"
22+
#include "svs/python/manager.h"
23+
24+
// svs
25+
#include "svs/lib/dispatcher.h"
26+
#include "svs/orchestrators/dynamic_flat.h"
27+
28+
// pybind
29+
#include <pybind11/numpy.h>
30+
#include <pybind11/pybind11.h>
31+
#include <pybind11/stl.h>
32+
33+
// stl
34+
#include <span>
35+
36+
/////
37+
///// DynamicFlat
38+
/////
39+
40+
namespace py = pybind11;
41+
namespace svs::python::dynamic_flat {
42+
43+
namespace {
44+
45+
/// Invoke ``f`` once for each (query type, data type, dimensionality) combination that
/// the dynamic flat bindings specialize on.
///
/// ``f`` must expose a templated call operator ``operator()<Q, T, N>()`` that takes no
/// run-time arguments.
template <typename F> void for_standard_specializations(F&& f) {
    // Template arguments, in order: QueryType, DataType, Dimensionality.
    // clang-format off
    f.template operator()<float,   float,        Dynamic>();
    f.template operator()<float,   svs::Float16, Dynamic>();
    f.template operator()<uint8_t, uint8_t,      Dynamic>();
    f.template operator()<int8_t,  int8_t,       Dynamic>();
    // clang-format on
}
57+
58+
template <typename ElementType>
59+
svs::DynamicFlat build_from_array(
60+
py_contiguous_array_t<ElementType> py_data,
61+
py_contiguous_array_t<size_t> py_ids,
62+
svs::DistanceType distance_type,
63+
size_t num_threads
64+
) {
65+
auto dispatcher = svs::DistanceDispatcher(distance_type);
66+
return dispatcher([&](auto distance) {
67+
return svs::DynamicFlat::build<ElementType>(
68+
create_blocked_data(py_data),
69+
std::span(py_ids.data(), py_ids.size()),
70+
distance,
71+
num_threads
72+
);
73+
});
74+
}
75+
76+
// Python docstring attached to the static ``build`` method registered by
// ``add_build_specialization``.
const char* BUILD_FROM_ARRAY_DOC = R"(
77+
Construct a DynamicFlat index over the given data with custom IDs, returning a searchable index.
78+
79+
Args:
80+
data: The dataset to index. **NOTE**: SVS will maintain an internal copy of the
81+
dataset. This may change in future releases.
82+
ids: Vector of ids to assign to each row in ``data``. Must have the same number of
83+
elements as ``data`` has rows.
84+
distance_type: The distance type to use for this dataset.
85+
num_threads: Number of threads for index construction.
86+
)";
87+
88+
template <typename ElementType>
89+
void add_build_specialization(py::class_<svs::DynamicFlat>& index) {
90+
index.def_static(
91+
"build",
92+
&build_from_array<ElementType>,
93+
py::arg("data"),
94+
py::arg("ids"),
95+
py::arg("distance_type"),
96+
py::arg("num_threads") = 1,
97+
BUILD_FROM_ARRAY_DOC
98+
);
99+
}
100+
101+
template <typename ElementType>
102+
void add_points(
103+
svs::DynamicFlat& index,
104+
const py_contiguous_array_t<ElementType>& py_data,
105+
const py_contiguous_array_t<size_t>& ids,
106+
bool reuse_empty = false
107+
) {
108+
if (py_data.ndim() != 2) {
109+
throw ANNEXCEPTION("Expected points to have 2 dimensions!");
110+
}
111+
if (ids.ndim() != 1) {
112+
throw ANNEXCEPTION("Expected ids to have 1 dimension!");
113+
}
114+
if (py_data.shape(0) != ids.shape(0)) {
115+
throw ANNEXCEPTION(
116+
"Expected IDs to be the same length as the number of rows in points!"
117+
);
118+
}
119+
index.add_points(data_view(py_data), std::span(ids.data(), ids.size()), reuse_empty);
120+
}
121+
122+
// Python docstring attached to the ``add`` method registered by
// ``add_points_specialization``.
const char* ADD_POINTS_DOCSTRING = R"(
123+
Add every point in ``points`` to the data, assigning the element-wise corresponding ID to
124+
each point.
125+
126+
Args:
127+
points: A matrix of data whose rows, corresponding to points in R^n, will be added to
128+
the data.
129+
ids: Vector of ids to assign to each row in ``points``. Must have the same number of
130+
elements as ``points`` has rows.
131+
reuse_empty: A flag that determines whether to reuse empty entries that may exist after deletion. When enabled,
132+
scan from the beginning to find and fill these empty entries when adding new points.
133+
134+
Furthermore, all entries in ``ids`` must be unique and not already exist in the data.
135+
If either of these does not hold, an exception will be thrown without mutating the
136+
underlying data.
137+
138+
When ``delete`` is called, vectors are directly removed and their slots become available for reuse.
139+
When ``add_points`` is called with the ``reuse_empty`` flag enabled, the memory is scanned from the beginning to locate and fill these empty entries with new points.
140+
)";
141+
142+
template <typename ElementType>
143+
void add_points_specialization(py::class_<svs::DynamicFlat>& index) {
144+
index.def(
145+
"add",
146+
&add_points<ElementType>,
147+
py::arg("points"),
148+
py::arg("ids"),
149+
py::arg("reuse_empty") = false,
150+
ADD_POINTS_DOCSTRING
151+
);
152+
}
153+
154+
///// Docstrings
155+
// Put docstrings here to hopefully make the implementation of `wrap` a bit less
156+
// cluttered.
157+
// Python docstring for the ``consolidate`` method bound in ``wrap``.
const char* CONSOLIDATE_DOCSTRING = R"(
158+
No-op method for compatibility with dynamic index interface.
159+
For the flat index, deletion is performed directly, so consolidation is not needed.
160+
)";
161+
162+
// Python docstring for the ``compact`` method bound in ``wrap``.
const char* COMPACT_DOCSTRING = R"(
163+
Remove any holes created in the data by renumbering internal IDs.
164+
Shrink the underlying data structures.
165+
Following ``consolidate``, this can potentially reduce the memory footprint of the data
166+
if a sufficient number of points were deleted.
167+
)";
168+
169+
// Python docstring for the ``delete`` method bound in ``wrap``.
const char* DELETE_DOCSTRING = R"(
170+
Delete the IDs from the data. This removes the specified vectors from the dataset
171+
so they will not be returned from future searches.
172+
173+
Args:
174+
ids: The IDs to delete.
175+
176+
Each element in IDs must be unique and must correspond to a valid ID stored in the data.
177+
Otherwise, an exception will be thrown. If an exception is thrown for this reason, the
178+
data will be left unchanged from before the function call.
179+
)";
180+
181+
// Python docstring for the ``all_ids`` method bound in ``wrap``.
const char* ALL_IDS_DOCSTRING = R"(
182+
Return a Numpy vector of all IDs currently in the data.
183+
)";
184+
185+
// Index saving.
186+
void save_index(
187+
const svs::DynamicFlat& index,
188+
const std::string& config_dir,
189+
const std::string& data_dir
190+
) {
191+
index.save(config_dir, data_dir);
192+
}
193+
194+
/////
195+
///// Assembly
196+
/////
197+
198+
template <typename Q, typename T, size_t N>
199+
svs::DynamicFlat assemble_uncompressed(
200+
const std::filesystem::path& config_directory,
201+
svs::VectorDataLoader<T, N, RebindAllocator<T>> datafile,
202+
svs::DistanceType distance_type,
203+
size_t num_threads
204+
) {
205+
auto dispatcher = svs::DistanceDispatcher(distance_type);
206+
return dispatcher([&](auto distance) {
207+
return svs::DynamicFlat::assemble<Q>(
208+
config_directory, datafile, distance, num_threads
209+
);
210+
});
211+
}
212+
213+
/// Register one ``assemble_uncompressed<Q, T, N>`` target on ``dispatcher`` for every
/// combination enumerated by ``for_standard_specializations``.
template <typename Dispatcher> void register_assembly(Dispatcher& dispatcher) {
    auto add_target = [&dispatcher]<typename Q, typename T, size_t N>() {
        dispatcher.register_target(&assemble_uncompressed<Q, T, N>);
    };
    for_standard_specializations(add_target);
}
218+
219+
// Loader types accepted by ``assemble`` for its ``data_loader`` argument; the variant
// currently has a single alternative.
using DynamicFlatAssembleTypes = std::variant<UnspecializedVectorDataLoader>;
220+
221+
/// Reload a ``svs::DynamicFlat`` index, selecting the matching ``assemble_uncompressed``
/// specialization at run time.
///
/// @param config_directory Location of the saved configuration; passed to the dispatcher.
/// @param data_loader Loader describing the saved dataset.
/// @param distance_type Distance to use with the reloaded index.
/// @param query_type Accepted but not used by this function.
/// @param enforce_dims Accepted but not used by this function.
/// @param num_threads Forwarded to the selected assembly routine.
svs::DynamicFlat assemble(
    const std::string& config_directory,
    DynamicFlatAssembleTypes data_loader,
    svs::DistanceType distance_type,
    svs::DataType SVS_UNUSED(query_type),
    bool SVS_UNUSED(enforce_dims),
    size_t num_threads
) {
    using AssembleDispatcher = svs::lib::Dispatcher<
        svs::DynamicFlat,
        const std::filesystem::path&,
        DynamicFlatAssembleTypes,
        svs::DistanceType,
        size_t>;

    auto dispatcher = AssembleDispatcher();
    register_assembly(dispatcher);

    return dispatcher.invoke(
        config_directory, std::move(data_loader), distance_type, num_threads
    );
}
241+
242+
} // namespace
243+
244+
// Register the ``DynamicFlat`` Python class on module ``m`` and attach its methods:
// the shared search/threading/data interfaces, ``consolidate``, ``compact``, a constructor
// that reloads a saved index, the static ``build`` factory, ``add``, ``delete``,
// ``has_id``, ``all_ids`` and ``save``.
void wrap(py::module& m) {
245+
std::string name = "DynamicFlat";
246+
py::class_<svs::DynamicFlat> flat(
247+
m, name.c_str(), "Top level class for the dynamic Flat exhaustive search index."
248+
);
249+
250+
// Interfaces added through helpers that are not defined in this file. Search is
// registered for ``float`` queries only.
add_search_specialization<float>(flat);
251+
add_threading_interface(flat);
252+
add_data_interface(flat);
253+
254+
// Dynamic interface.
255+
flat.def("consolidate", &svs::DynamicFlat::consolidate, CONSOLIDATE_DOCSTRING);
256+
flat.def("compact", &svs::DynamicFlat::compact, COMPACT_DOCSTRING);
257+
258+
// Reloading
// The constructor forwards to ``assemble``; that function accepts ``query_type`` and
// ``enforce_dims`` but does not use them.
259+
flat.def(
260+
py::init(&assemble),
261+
py::arg("config_directory"),
262+
py::arg("data_loader"),
263+
py::arg("distance") = svs::L2,
264+
py::arg("query_type") = svs::DataType::float32,
265+
py::arg("enforce_dims") = false,
266+
py::arg("num_threads") = 1
267+
);
268+
269+
// Index building.
// Only the ``float`` element type is registered for ``build``.
270+
add_build_specialization<float>(flat);
271+
272+
// Index modification.
// Only the ``float`` element type is registered for ``add``.
273+
add_points_specialization<float>(flat);
274+
275+
// Index Deletion.
276+
flat.def(
277+
"delete",
278+
[](svs::DynamicFlat& index, const py_contiguous_array_t<size_t>& ids) {
279+
index.delete_points(as_span(ids));
280+
},
281+
py::arg("ids"),
282+
DELETE_DOCSTRING
283+
);
284+
285+
// ID inspection
286+
flat.def(
287+
"has_id",
288+
&svs::DynamicFlat::has_id,
289+
py::arg("id"),
290+
"Return whether the ID exists in the data."
291+
);
292+
293+
flat.def(
294+
"all_ids",
295+
[](const svs::DynamicFlat& index) {
296+
const auto& v = index.all_ids();
297+
// Copy the IDs into a newly created numpy vector of matching length.
298+
auto npv = numpy_vector<size_t>(v.size());
299+
std::copy(v.begin(), v.end(), npv.mutable_unchecked().mutable_data());
300+
return npv;
301+
},
302+
ALL_IDS_DOCSTRING
303+
);
304+
305+
// Saving
306+
flat.def(
307+
"save",
308+
&save_index,
309+
py::arg("config_directory"),
310+
py::arg("data_directory"),
311+
R"(
312+
Save a constructed index to disk with separate config and data directories.
313+
314+
Args:
315+
config_directory: Directory where the config will be saved.
316+
data_directory: Directory where the dataset will be saved.
317+
318+
If the directories do not exist, they will be created if their parents exist.
319+
320+
It is the caller's responsibility to ensure that no existing data will be
321+
overwritten when saving the index to these directories.
322+
)"
323+
);
324+
}
325+
326+
} // namespace svs::python::dynamic_flat

bindings/python/src/python_bindings.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "svs/python/allocator.h"
1919
#include "svs/python/common.h"
2020
#include "svs/python/core.h"
21+
#include "svs/python/dynamic_flat.h"
2122
#include "svs/python/dynamic_vamana.h"
2223
#include "svs/python/flat.h"
2324

@@ -244,6 +245,7 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs`
244245
///// Indexes
245246
// Flat
246247
svs::python::flat::wrap(m);
248+
svs::python::dynamic_flat::wrap(m);
247249

248250
// Vamana
249251
svs::python::vamana::wrap(m);

0 commit comments

Comments
 (0)