Skip to content

Commit eff5988

Browse files
Fix some TODOs after the cloud release (#505)
Fix some TODOs after the cloud release
1 parent 2bd17c5 commit eff5988

File tree

3 files changed

+9
-96
lines changed

3 files changed

+9
-96
lines changed

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 7 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1684,10 +1684,6 @@ def ingest_vectors_udf(
16841684
trace_id: Optional[str] = None,
16851685
distance_metric: vspy.DistanceMetric = vspy.DistanceMetric.SUM_OF_SQUARES,
16861686
):
1687-
import os
1688-
import random
1689-
import tempfile
1690-
16911687
import tiledb.cloud
16921688
from tiledb.vector_search.module import StdVector_u64
16931689
from tiledb.vector_search.module import array_to_matrix
@@ -1702,11 +1698,6 @@ def ingest_vectors_udf(
17021698
partial_write_array_ids_uri = partial_write_array_group[IDS_ARRAY_NAME].uri
17031699
partial_write_array_parts_uri = partial_write_array_group[PARTS_ARRAY_NAME].uri
17041700
partial_write_array_index_uri = partial_write_array_group[INDEX_ARRAY_NAME].uri
1705-
# Temporary solution until `ivf_index` library change gets released.
1706-
# TODO(nikos) remove this when the `partition_start` parameter of `ivf_index` gets released.
1707-
partial_write_array_index_array = tiledb.open(
1708-
partial_write_array_index_uri, "w", timestamp=index_timestamp
1709-
)
17101701

17111702
for part in range(start, end, batch):
17121703
part_end = part + batch
@@ -1715,32 +1706,7 @@ def ingest_vectors_udf(
17151706

17161707
str(part) + "-" + str(part_end)
17171708
part_id = int(part / batch)
1718-
1719-
# Temporary solution until `ivf_index` library change gets released. We create a local disk
1720-
# temporary array to hold the partial indices and write them to the respective range in the main array.
1721-
# TODO(nikos) remove this when the `partition_start` parameter of `ivf_index` gets released.
1722-
partial_write_array_index_tmp_uri = os.path.join(
1723-
tempfile.gettempdir(),
1724-
f"{random.randint(0,MAX_INT32)}_{part_id}",
1725-
)
1726-
index_array_rows_dim = tiledb.Dim(
1727-
name="rows",
1728-
domain=(0, MAX_INT32),
1729-
tile=100000,
1730-
dtype=np.dtype(np.int32),
1731-
)
1732-
index_array_dom = tiledb.Domain(index_array_rows_dim)
1733-
index_attr = tiledb.Attr(
1734-
name="values",
1735-
dtype=np.dtype(np.uint64),
1736-
)
1737-
index_schema = tiledb.ArraySchema(
1738-
domain=index_array_dom,
1739-
sparse=False,
1740-
attrs=[index_attr],
1741-
)
1742-
tiledb.Array.create(partial_write_array_index_tmp_uri, index_schema)
1743-
partition_start = part_id * (partitions + 1)
1709+
part_id * (partitions + 1)
17441710

17451711
logger.debug("Input vectors start_pos: %d, end_pos: %d", part, part_end)
17461712
updated_ids = read_updated_ids(
@@ -1758,12 +1724,11 @@ def ingest_vectors_udf(
17581724
deleted_ids=StdVector_u64(updated_ids),
17591725
centroids_uri=centroids_uri,
17601726
parts_uri=partial_write_array_parts_uri,
1761-
index_array_uri=partial_write_array_index_tmp_uri,
1762-
# index_array_uri=partial_write_array_index_uri,
1727+
index_array_uri=partial_write_array_index_uri,
17631728
id_uri=partial_write_array_ids_uri,
17641729
start=part,
17651730
end=part_end,
1766-
# partition_start=part_id * (partitions + 1),
1731+
partition_start=part_id * (partitions + 1),
17671732
nthreads=threads,
17681733
**(
17691734
{"timestamp": index_timestamp}
@@ -1801,12 +1766,11 @@ def ingest_vectors_udf(
18011766
deleted_ids=StdVector_u64(updated_ids),
18021767
centroids_uri=centroids_uri,
18031768
parts_uri=partial_write_array_parts_uri,
1804-
index_array_uri=partial_write_array_index_tmp_uri,
1805-
# index_array_uri=partial_write_array_index_uri,
1769+
index_array_uri=partial_write_array_index_uri,
18061770
id_uri=partial_write_array_ids_uri,
18071771
start=part,
18081772
end=part_end,
1809-
# partition_start=part_id * (partitions + 1),
1773+
partition_start=part_id * (partitions + 1),
18101774
nthreads=threads,
18111775
**(
18121776
{"timestamp": index_timestamp}
@@ -1816,15 +1780,6 @@ def ingest_vectors_udf(
18161780
config=config,
18171781
)
18181782

1819-
# Temporary solution until `ivf_index` library change gets released.
1820-
# TODO(nikos) remove this when the `partition_start` parameter of `ivf_index` gets released.
1821-
with tiledb.open(partial_write_array_index_tmp_uri) as a:
1822-
partial_write_array_index_array[
1823-
partition_start : partition_start + partitions + 1
1824-
] = a[0 : partitions + 1]
1825-
tiledb.Array.delete_array(partial_write_array_index_tmp_uri)
1826-
partial_write_array_index_array.close()
1827-
18281783
def ingest_additions_udf(
18291784
index_group_uri: str,
18301785
updates_uri: str,
@@ -1837,10 +1792,6 @@ def ingest_additions_udf(
18371792
verbose: bool = False,
18381793
trace_id: Optional[str] = None,
18391794
):
1840-
import os
1841-
import random
1842-
import tempfile
1843-
18441795
import tiledb.cloud
18451796
from tiledb.vector_search.module import StdVector_u64
18461797
from tiledb.vector_search.module import array_to_matrix
@@ -1855,30 +1806,6 @@ def ingest_additions_udf(
18551806
partial_write_array_parts_uri = partial_write_array_group[PARTS_ARRAY_NAME].uri
18561807
partial_write_array_index_uri = partial_write_array_group[INDEX_ARRAY_NAME].uri
18571808

1858-
# Temporary solution until `ivf_index` library change gets released. We create a local disk
1859-
# temporary array to hold the partial indices and write them to the respective range in the main array.
1860-
# TODO(nikos) remove this when the `partition_start` parameter of `ivf_index` gets released.
1861-
partial_write_array_index_tmp_uri = os.path.join(
1862-
tempfile.gettempdir(), f"{random.randint(0,MAX_INT32)}_{partition_start}"
1863-
)
1864-
index_array_rows_dim = tiledb.Dim(
1865-
name="rows",
1866-
domain=(0, MAX_INT32),
1867-
tile=100000,
1868-
dtype=np.dtype(np.int32),
1869-
)
1870-
index_array_dom = tiledb.Domain(index_array_rows_dim)
1871-
index_attr = tiledb.Attr(
1872-
name="values",
1873-
dtype=np.dtype(np.uint64),
1874-
)
1875-
index_schema = tiledb.ArraySchema(
1876-
domain=index_array_dom,
1877-
sparse=False,
1878-
attrs=[index_attr],
1879-
)
1880-
tiledb.Array.create(partial_write_array_index_tmp_uri, index_schema)
1881-
18821809
additions_vectors, additions_external_ids = read_additions(
18831810
updates_uri=updates_uri,
18841811
config=config,
@@ -1904,29 +1831,16 @@ def ingest_additions_udf(
19041831
deleted_ids=StdVector_u64(np.array([], np.uint64)),
19051832
centroids_uri=centroids_uri,
19061833
parts_uri=partial_write_array_parts_uri,
1907-
index_array_uri=partial_write_array_index_tmp_uri,
1908-
# index_array_uri=partial_write_array_index_uri,
1834+
index_array_uri=partial_write_array_index_uri,
19091835
id_uri=partial_write_array_ids_uri,
19101836
start=write_offset,
19111837
end=0,
1912-
# partition_start=partition_start,
1838+
partition_start=partition_start,
19131839
nthreads=threads,
19141840
**({"timestamp": index_timestamp} if index_timestamp is not None else {}),
19151841
config=config,
19161842
)
19171843

1918-
# Temporary solution until `ivf_index` library change gets released.
1919-
# TODO(nikos) remove this when the `partition_start` parameter of `ivf_index` gets released.
1920-
partial_write_array_index_array = tiledb.open(
1921-
partial_write_array_index_uri, "w", timestamp=index_timestamp
1922-
)
1923-
with tiledb.open(partial_write_array_index_tmp_uri) as a:
1924-
partial_write_array_index_array[
1925-
partition_start : partition_start + partitions + 1
1926-
] = a[0 : partitions + 1]
1927-
tiledb.Array.delete_array(partial_write_array_index_tmp_uri)
1928-
partial_write_array_index_array.close()
1929-
19301844
def compute_partition_indexes_udf(
19311845
index_group_uri: str,
19321846
partitions: int,

apis/python/src/tiledb/vector_search/ivf_flat_index.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,8 +427,7 @@ def dist_qv_udf(
427427
k_nn=k_nn,
428428
ctx=Ctx(config),
429429
timestamp=timestamp,
430-
# TODO(nikos) add this after the library change gets released in TileDB cloud
431-
# upper_bound=0 if memory_budget == -1 else memory_budget,
430+
upper_bound=0 if memory_budget == -1 else memory_budget,
432431
)
433432
results = []
434433
for q in range(len(r)):

0 commit comments

Comments
 (0)