Skip to content

Commit d2032d2

Browse files
author
Nikos Papailiou
committed
Add FVEC, IVEC and BVEC inputs
1 parent d0f8405 commit d2032d2

File tree

8 files changed

+127
-1
lines changed

8 files changed

+127
-1
lines changed

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def ingest(
7070
import math
7171
from typing import Any, Mapping, Optional
7272
import multiprocessing
73+
import os
7374

7475
import numpy as np
7576

@@ -151,6 +152,33 @@ def read_source_metadata(
151152
size = int.from_bytes(f.read(4), "little")
152153
dimensions = int.from_bytes(f.read(4), "little")
153154
return size, dimensions, np.float32
155+
elif source_type == "FVEC":
156+
vfs = tiledb.VFS()
157+
with vfs.open(source_uri, "rb") as f:
158+
dimensions = int.from_bytes(f.read(4), "little")
159+
vector_size = 4 + dimensions * 4
160+
f.seek(0, os.SEEK_END)
161+
file_size = f.tell()
162+
size = int(file_size / vector_size)
163+
return size, dimensions, np.float32
164+
elif source_type == "IVEC":
165+
vfs = tiledb.VFS()
166+
with vfs.open(source_uri, "rb") as f:
167+
dimensions = int.from_bytes(f.read(4), "little")
168+
vector_size = 4 + dimensions * 4
169+
f.seek(0, os.SEEK_END)
170+
file_size = f.tell()
171+
size = int(file_size / vector_size)
172+
return size, dimensions, np.int32
173+
elif source_type == "BVEC":
174+
vfs = tiledb.VFS()
175+
with vfs.open(source_uri, "rb") as f:
176+
dimensions = int.from_bytes(f.read(4), "little")
177+
vector_size = 4 + dimensions
178+
f.seek(0, os.SEEK_END)
179+
file_size = f.tell()
180+
size = int(file_size / vector_size)
181+
return size, dimensions, np.uint8
154182
else:
155183
raise ValueError(f"Not supported source_type {source_type}")
156184

@@ -419,6 +447,40 @@ def read_input_vectors(
419447
).astype(vector_type),
420448
(read_size, dimensions),
421449
)
450+
elif source_type == "FVEC" or source_type == "IVEC":
451+
vfs = tiledb.VFS()
452+
vector_values = 1 + dimensions
453+
vector_size = vector_values * 4
454+
read_size = end_pos - start_pos
455+
read_offset = start_pos * vector_size
456+
with vfs.open(source_uri, "rb") as f:
457+
f.seek(read_offset)
458+
return np.delete(
459+
np.reshape(
460+
np.frombuffer(
461+
f.read(read_size * vector_size),
462+
count=read_size * vector_values,
463+
dtype=vector_type,
464+
).astype(vector_type),
465+
(read_size, dimensions + 1),
466+
), 0, axis=1)
467+
elif source_type == "BVEC":
468+
vfs = tiledb.VFS()
469+
vector_values = 1 + dimensions
470+
vector_size = vector_values * 1
471+
read_size = end_pos - start_pos
472+
read_offset = start_pos * vector_size
473+
with vfs.open(source_uri, "rb") as f:
474+
f.seek(read_offset)
475+
return np.delete(
476+
np.reshape(
477+
np.frombuffer(
478+
f.read(read_size * vector_size),
479+
count=read_size * vector_values,
480+
dtype=vector_type,
481+
).astype(vector_type),
482+
(read_size, dimensions + 1),
483+
), 0, axis=1)
422484

423485
# --------------------------------------------------------------------
424486
# UDFs
@@ -1182,6 +1244,7 @@ def consolidate_and_vacuum(
11821244
else:
11831245
raise err
11841246
group = tiledb.Group(array_uri, "w")
1247+
group.meta["dataset_type"] = "vector_search"
11851248

11861249
in_size, dimensions, vector_type = read_source_metadata(
11871250
source_uri=source_uri, source_type=source_type, logger=logger

apis/python/test/common.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,44 @@ def xbin_mmap(fname, dtype):
1111
return np.memmap(fname, dtype=dtype, mode="r", offset=8, shape=(n, d))
1212

1313

14+
def get_queries_fvec(file, dimensions, nqueries=None):
15+
vfs = tiledb.VFS()
16+
vector_values = 1 + dimensions
17+
vector_size = vector_values * 4
18+
read_size = nqueries
19+
read_offset = 0
20+
with vfs.open(file, "rb") as f:
21+
f.seek(read_offset)
22+
return np.delete(
23+
np.reshape(
24+
np.frombuffer(
25+
f.read(read_size * vector_size),
26+
count=read_size * vector_values,
27+
dtype=np.float32,
28+
).astype(np.float32),
29+
(read_size, dimensions + 1),
30+
), 0, axis=1)
31+
32+
33+
def get_groundtruth_ivec(file, k=None, nqueries=None):
34+
vfs = tiledb.VFS()
35+
vector_values = 1 + k
36+
vector_size = vector_values * 4
37+
read_size = nqueries
38+
read_offset = 0
39+
with vfs.open(file, "rb") as f:
40+
f.seek(read_offset)
41+
return np.delete(
42+
np.reshape(
43+
np.frombuffer(
44+
f.read(read_size * vector_size),
45+
count=read_size * vector_values,
46+
dtype=np.int32,
47+
).astype(np.int32),
48+
(read_size, k + 1),
49+
), 0, axis=1), None
50+
51+
1452
def get_queries(dataset_dir, dtype, nqueries=None):
1553
fname = os.path.join(dataset_dir, "queries")
1654
x = xbin_mmap(fname, dtype=dtype)
4.92 MB
Binary file not shown.
39.5 KB
Binary file not shown.
12.3 MB
Binary file not shown.
50.4 KB
Binary file not shown.

apis/python/test/test_ingestion.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,28 @@ def test_ivf_flat_ingestion_f32(tmp_path):
9393
)
9494
result = np.transpose(index.query(np.transpose(query_vectors), k=k, nprobe=partitions))
9595
assert np.array_equal(np.sort(result, axis=1), np.sort(gt_i, axis=1))
96+
97+
98+
def test_ivf_flat_ingestion_fvec(tmp_path):
99+
source_uri = "test/data/siftsmall/siftsmall_base.fvecs"
100+
queries_uri = "test/data/siftsmall/siftsmall_query.fvecs"
101+
gt_uri = "test/data/siftsmall/siftsmall_groundtruth.ivecs"
102+
source_type = "FVEC"
103+
array_uri = os.path.join(tmp_path, "array")
104+
k = 100
105+
dimensions = 128
106+
partitions = 1000
107+
nqueries = 100
108+
109+
query_vectors = get_queries_fvec(queries_uri, dimensions=dimensions, nqueries=nqueries)
110+
gt_i, gt_d = get_groundtruth_ivec(gt_uri, k=k, nqueries=nqueries)
111+
112+
index = ingest(
113+
index_type="IVF_FLAT",
114+
array_uri=array_uri,
115+
source_uri=source_uri,
116+
source_type=source_type,
117+
partitions=partitions,
118+
)
119+
result = np.transpose(index.query(np.transpose(query_vectors), k=k, nprobe=partitions))
120+
assert np.array_equal(np.sort(result, axis=1), np.sort(gt_i, axis=1))

src/include/detail/linalg/tdb_io.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ void write_matrix(
111111
TILEDB_COL_MAJOR;
112112
query.set_layout(order)
113113
.set_data_buffer(
114-
"values", &A(0, 0), (int)A.num_rows() * (int)A.num_cols())
114+
"values", &A(0, 0), (uint64_t)A.num_rows() * (uint64_t)A.num_cols())
115115
.set_subarray(subarray);
116116
query.submit();
117117

0 commit comments

Comments
 (0)