Skip to content

Commit edee144

Browse files
Merge pull request #67 from TileDB-Inc/npapa/cpp_ingestion
Use native vector ingestion in Python taskgraphs
2 parents 2ed2004 + ed1aa78 commit edee144

File tree

16 files changed

+787
-240
lines changed

16 files changed

+787
-240
lines changed
Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,19 @@
1-
from .index import FlatIndex
1+
from .index import FlatIndex, IVFFlatIndex
22
from .ingestion import ingest
33
from .module import load_as_array
44
from .module import load_as_matrix
5-
from .module import query_vq, query_kmeans, validate_top_k, array_to_matrix
5+
from .module import query_vq, query_kmeans, validate_top_k, array_to_matrix, ivf_index, ivf_index_tdb
66

77
__all__ = [
88
"FlatIndex",
9+
"IVFFlatIndex",
910
"load_as_array",
1011
"load_as_matrix",
1112
"ingest",
1213
"query_vq",
1314
"query_kmeans",
1415
"validate_top_k",
16+
"ivf_index",
17+
"ivf_index_tdb",
1518
"array_to_matrix",
1619
]

apis/python/src/tiledb/vector_search/index.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import os
22

33
import numpy as np
4-
54
from tiledb.vector_search.module import *
65

76

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 304 additions & 192 deletions
Large diffs are not rendered by default.

apis/python/src/tiledb/vector_search/module.cc

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,56 @@ static void declare_kmeans_query(py::module& m, const std::string& suffix) {
124124
}, py::keep_alive<1,2>());
125125
}
126126

127+
template <typename T>
128+
static void declare_ivf_index(py::module& m, const std::string& suffix) {
129+
m.def(("ivf_index_" + suffix).c_str(),
130+
[](tiledb::Context& ctx,
131+
const ColMajorMatrix<T>& db,
132+
const std::string& centroids_uri,
133+
const std::string& parts_uri,
134+
const std::string& index_uri,
135+
const std::string& id_uri,
136+
size_t start_pos,
137+
size_t end_pos,
138+
size_t nthreads) -> int {
139+
return detail::ivf::ivf_index<T, uint64_t, float>(
140+
ctx,
141+
db,
142+
centroids_uri,
143+
parts_uri,
144+
index_uri,
145+
id_uri,
146+
start_pos,
147+
end_pos,
148+
nthreads);
149+
}, py::keep_alive<1,2>());
150+
}
151+
152+
template <typename T>
153+
static void declare_ivf_index_tdb(py::module& m, const std::string& suffix) {
154+
m.def(("ivf_index_tdb_" + suffix).c_str(),
155+
[](tiledb::Context& ctx,
156+
const std::string& db_uri,
157+
const std::string& centroids_uri,
158+
const std::string& parts_uri,
159+
const std::string& index_uri,
160+
const std::string& id_uri,
161+
size_t start_pos,
162+
size_t end_pos,
163+
size_t nthreads) -> int {
164+
return detail::ivf::ivf_index<T, uint64_t, float>(
165+
ctx,
166+
db_uri,
167+
centroids_uri,
168+
parts_uri,
169+
index_uri,
170+
id_uri,
171+
start_pos,
172+
end_pos,
173+
nthreads);
174+
}, py::keep_alive<1,2>());
175+
}
176+
127177

128178
// Declarations for typed subclasses of ColMajorMatrix
129179
template <typename P>
@@ -273,4 +323,9 @@ PYBIND11_MODULE(_tiledbvspy, m) {
273323
declare_kmeans_query<uint8_t>(m, "u8");
274324
declare_kmeans_query<float>(m, "f32");
275325

326+
declare_ivf_index<uint8_t>(m, "u8");
327+
declare_ivf_index<float>(m, "f32");
328+
declare_ivf_index_tdb<uint8_t>(m, "u8");
329+
declare_ivf_index_tdb<float>(m, "f32");
330+
276331
}

apis/python/src/tiledb/vector_search/module.py

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import tiledb
66
from tiledb.vector_search._tiledbvspy import *
7-
from tiledb.vector_search import _tiledbvspy as cc
87

98

109
def load_as_matrix(path: str, nqueries: int = 0, config: Dict = {}):
@@ -82,6 +81,84 @@ def query_vq(db: "colMajorMatrix", *args):
8281
raise TypeError("Unknown type!")
8382

8483

84+
def ivf_index_tdb(
85+
dtype: np.dtype,
86+
db_uri: str,
87+
centroids_uri: str,
88+
parts_uri: str,
89+
index_uri: str,
90+
id_uri: str,
91+
start: int = 0,
92+
end: int = 0,
93+
nthreads: int = 0,
94+
config: Dict = None,
95+
):
96+
if config is None:
97+
ctx = Ctx({})
98+
else:
99+
ctx = Ctx(config)
100+
101+
args = tuple(
102+
[
103+
ctx,
104+
db_uri,
105+
centroids_uri,
106+
parts_uri,
107+
index_uri,
108+
id_uri,
109+
start,
110+
end,
111+
nthreads
112+
]
113+
)
114+
115+
if dtype == np.float32:
116+
return ivf_index_tdb_f32(*args)
117+
elif dtype == np.uint8:
118+
return ivf_index_tdb_u8(*args)
119+
else:
120+
raise TypeError("Unknown type!")
121+
122+
123+
def ivf_index(
124+
dtype: np.dtype,
125+
db: "colMajorMatrix",
126+
centroids_uri: str,
127+
parts_uri: str,
128+
index_uri: str,
129+
id_uri: str,
130+
start: int = 0,
131+
end: int = 0,
132+
nthreads: int = 0,
133+
config: Dict = None,
134+
):
135+
if config is None:
136+
ctx = Ctx({})
137+
else:
138+
ctx = Ctx(config)
139+
140+
args = tuple(
141+
[
142+
ctx,
143+
db,
144+
centroids_uri,
145+
parts_uri,
146+
index_uri,
147+
id_uri,
148+
start,
149+
end,
150+
nthreads
151+
]
152+
)
153+
154+
if dtype == np.float32:
155+
return ivf_index_f32(*args)
156+
elif dtype == np.uint8:
157+
return ivf_index_u8(*args)
158+
else:
159+
raise TypeError("Unknown type!")
160+
161+
85162
def query_kmeans(
86163
dtype: np.dtype,
87164
parts_db: "colMajorMatrix",
@@ -150,7 +227,7 @@ def query_kmeans(
150227

151228
def validate_top_k(results: np.ndarray, ground_truth: np.ndarray):
152229
if results.dtype == np.uint64:
153-
return cc.validate_top_k_u64(results, ground_truth)
230+
return validate_top_k_u64(results, ground_truth)
154231
else:
155232
raise TypeError("Unknown type for validate_top_k!")
156233

apis/python/test/common.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,44 @@ def xbin_mmap(fname, dtype):
1111
return np.memmap(fname, dtype=dtype, mode="r", offset=8, shape=(n, d))
1212

1313

14+
def get_queries_fvec(file, dimensions, nqueries=None):
15+
vfs = tiledb.VFS()
16+
vector_values = 1 + dimensions
17+
vector_size = vector_values * 4
18+
read_size = nqueries
19+
read_offset = 0
20+
with vfs.open(file, "rb") as f:
21+
f.seek(read_offset)
22+
return np.delete(
23+
np.reshape(
24+
np.frombuffer(
25+
f.read(read_size * vector_size),
26+
count=read_size * vector_values,
27+
dtype=np.float32,
28+
).astype(np.float32),
29+
(read_size, dimensions + 1),
30+
), 0, axis=1)
31+
32+
33+
def get_groundtruth_ivec(file, k=None, nqueries=None):
34+
vfs = tiledb.VFS()
35+
vector_values = 1 + k
36+
vector_size = vector_values * 4
37+
read_size = nqueries
38+
read_offset = 0
39+
with vfs.open(file, "rb") as f:
40+
f.seek(read_offset)
41+
return np.delete(
42+
np.reshape(
43+
np.frombuffer(
44+
f.read(read_size * vector_size),
45+
count=read_size * vector_values,
46+
dtype=np.int32,
47+
).astype(np.int32),
48+
(read_size, k + 1),
49+
), 0, axis=1), None
50+
51+
1452
def get_queries(dataset_dir, dtype, nqueries=None):
1553
fname = os.path.join(dataset_dir, "queries")
1654
x = xbin_mmap(fname, dtype=dtype)
4.92 MB
Binary file not shown.
39.5 KB
Binary file not shown.
12.3 MB
Binary file not shown.
50.4 KB
Binary file not shown.

0 commit comments

Comments
 (0)