Skip to content

Commit 9c7acb9

Browse files
Merge pull request #121 from TileDB-Inc/npapa/zstd-compression
Use ZstdFilter as default compression
2 parents 2284ff6 + 491aff7 commit 9c7acb9

File tree

2 files changed, with 37 additions and 8 deletions.

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 33 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ def ingest(
8888
PARTIAL_WRITE_ARRAY_DIR = storage_formats[STORAGE_VERSION][
8989
"PARTIAL_WRITE_ARRAY_DIR"
9090
]
91+
DEFAULT_ATTR_FILTERS = storage_formats[STORAGE_VERSION]["DEFAULT_ATTR_FILTERS"]
9192
VECTORS_PER_WORK_ITEM = 20000000
9293
MAX_TASKS_PER_STAGE = 100
9394
CENTRALISED_KMEANS_MAX_SAMPLE_SIZE = 1000000
@@ -214,7 +215,9 @@ def create_arrays(
214215
parts_array_dom = tiledb.Domain(
215216
parts_array_rows_dim, parts_array_cols_dim
216217
)
217-
parts_attr = tiledb.Attr(name="values", dtype=vector_type)
218+
parts_attr = tiledb.Attr(
219+
name="values", dtype=vector_type, filters=DEFAULT_ATTR_FILTERS
220+
)
218221
parts_schema = tiledb.ArraySchema(
219222
domain=parts_array_dom,
220223
sparse=False,
@@ -261,7 +264,9 @@ def create_arrays(
261264
centroids_array_rows_dim, centroids_array_cols_dim
262265
)
263266
centroids_attr = tiledb.Attr(
264-
name="centroids", dtype=np.dtype(np.float32)
267+
name="centroids",
268+
dtype=np.dtype(np.float32),
269+
filters=DEFAULT_ATTR_FILTERS,
265270
)
266271
centroids_schema = tiledb.ArraySchema(
267272
domain=centroids_array_dom,
@@ -284,7 +289,11 @@ def create_arrays(
284289
dtype=np.dtype(np.int32),
285290
)
286291
index_array_dom = tiledb.Domain(index_array_rows_dim)
287-
index_attr = tiledb.Attr(name="values", dtype=np.dtype(np.uint64))
292+
index_attr = tiledb.Attr(
293+
name="values",
294+
dtype=np.dtype(np.uint64),
295+
filters=DEFAULT_ATTR_FILTERS,
296+
)
288297
index_schema = tiledb.ArraySchema(
289298
domain=index_array_dom,
290299
sparse=False,
@@ -306,7 +315,11 @@ def create_arrays(
306315
dtype=np.dtype(np.int32),
307316
)
308317
ids_array_dom = tiledb.Domain(ids_array_rows_dim)
309-
ids_attr = tiledb.Attr(name="values", dtype=np.dtype(np.uint64))
318+
ids_attr = tiledb.Attr(
319+
name="values",
320+
dtype=np.dtype(np.uint64),
321+
filters=DEFAULT_ATTR_FILTERS,
322+
)
310323
ids_schema = tiledb.ArraySchema(
311324
domain=ids_array_dom,
312325
sparse=False,
@@ -336,7 +349,9 @@ def create_arrays(
336349
parts_array_dom = tiledb.Domain(
337350
parts_array_rows_dim, parts_array_cols_dim
338351
)
339-
parts_attr = tiledb.Attr(name="values", dtype=vector_type)
352+
parts_attr = tiledb.Attr(
353+
name="values", dtype=vector_type, filters=DEFAULT_ATTR_FILTERS
354+
)
340355
parts_schema = tiledb.ArraySchema(
341356
domain=parts_array_dom,
342357
sparse=False,
@@ -386,7 +401,11 @@ def create_arrays(
386401
dtype=np.dtype(np.int32),
387402
)
388403
ids_array_dom = tiledb.Domain(ids_array_rows_dim)
389-
ids_attr = tiledb.Attr(name="values", dtype=np.dtype(np.uint64))
404+
ids_attr = tiledb.Attr(
405+
name="values",
406+
dtype=np.dtype(np.uint64),
407+
filters=DEFAULT_ATTR_FILTERS,
408+
)
390409
ids_schema = tiledb.ArraySchema(
391410
domain=ids_array_dom,
392411
sparse=False,
@@ -418,7 +437,9 @@ def create_arrays(
418437
parts_array_dom = tiledb.Domain(
419438
parts_array_rows_dim, parts_array_cols_dim
420439
)
421-
parts_attr = tiledb.Attr(name="values", dtype=vector_type)
440+
parts_attr = tiledb.Attr(
441+
name="values", dtype=vector_type, filters=DEFAULT_ATTR_FILTERS
442+
)
422443
parts_schema = tiledb.ArraySchema(
423444
domain=parts_array_dom,
424445
sparse=False,
@@ -445,7 +466,11 @@ def create_arrays(
445466
dtype=np.dtype(np.int32),
446467
)
447468
index_array_dom = tiledb.Domain(index_array_rows_dim)
448-
index_attr = tiledb.Attr(name="values", dtype=np.dtype(np.uint64))
469+
index_attr = tiledb.Attr(
470+
name="values",
471+
dtype=np.dtype(np.uint64),
472+
filters=DEFAULT_ATTR_FILTERS,
473+
)
449474
index_schema = tiledb.ArraySchema(
450475
domain=index_array_dom,
451476
sparse=False,

apis/python/src/tiledb/vector_search/storage_formats.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,21 @@
1+
import tiledb
2+
# Settings for each storage-format version, keyed by version string.
# Every version provides the same keys: the four *_ARRAY_NAME entries, the
# PARTIAL_WRITE_ARRAY_DIR name, and DEFAULT_ATTR_FILTERS, which the ingestion
# code passes as ``filters=`` when it creates ``tiledb.Attr`` objects.
storage_formats = {
    "0.1": {
        "CENTROIDS_ARRAY_NAME": "centroids.tdb",
        "INDEX_ARRAY_NAME": "index.tdb",
        "IDS_ARRAY_NAME": "ids.tdb",
        "PARTS_ARRAY_NAME": "parts.tdb",
        "PARTIAL_WRITE_ARRAY_DIR": "write_temp",
        # No explicit attribute filters for this version.
        "DEFAULT_ATTR_FILTERS": None,
    },
    "0.2": {
        "CENTROIDS_ARRAY_NAME": "partition_centroids",
        "INDEX_ARRAY_NAME": "partition_indexes",
        "IDS_ARRAY_NAME": "shuffled_vector_ids",
        "PARTS_ARRAY_NAME": "shuffled_vectors",
        "PARTIAL_WRITE_ARRAY_DIR": "temp_data",
        # Attributes use a filter list holding a single Zstd filter.
        "DEFAULT_ATTR_FILTERS": tiledb.FilterList([tiledb.ZstdFilter()]),
    },
}

0 commit comments

Comments
 (0)