
Commit d563aa8

Merge pull request #120 from TileDB-Inc/npapa/numpy-ingestion
Add support for ingesting from in-memory numpy arrays
2 parents: 1756645 + 41d36ee

5 files changed: +158 -58

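For orientation before the diffs: with this change, in-memory vectors can be ingested directly, roughly like this (a minimal sketch; the URI and data are illustrative, not from the commit):

    import numpy as np
    from tiledb.vector_search.ingestion import ingest

    # Hypothetical data: 1000 vectors of dimension 128.
    vectors = np.random.rand(1000, 128).astype(np.float32)

    ingest(
        index_type="FLAT",
        index_uri="/tmp/my_flat_index",  # illustrative URI
        input_vectors=vectors,           # new: replaces source_uri/source_type
    )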

apis/python/src/tiledb/vector_search/index.py

Lines changed: 6 additions & 2 deletions
@@ -154,14 +154,18 @@ def __init__(
 
         dtype = group.meta.get("dtype", None)
         if dtype is None:
-            schema = tiledb.ArraySchema.load(self.parts_db_uri, ctx=tiledb.Ctx(self.config))
+            schema = tiledb.ArraySchema.load(
+                self.parts_db_uri, ctx=tiledb.Ctx(self.config)
+            )
             self.dtype = np.dtype(schema.attr("values").dtype)
         else:
             self.dtype = np.dtype(dtype)
 
         self.partitions = group.meta.get("partitions", -1)
         if self.partitions == -1:
-            schema = tiledb.ArraySchema.load(self.centroids_uri, ctx=tiledb.Ctx(self.config))
+            schema = tiledb.ArraySchema.load(
+                self.centroids_uri, ctx=tiledb.Ctx(self.config)
+            )
             self.partitions = schema.domain.dim("cols").domain[1] + 1
 
     def query(

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 97 additions & 12 deletions
@@ -3,14 +3,16 @@
 
 from tiledb.cloud.dag import Mode
 from tiledb.vector_search.index import FlatIndex, IVFFlatIndex, Index
+import numpy as np
 
 
 def ingest(
     index_type: str,
     index_uri: str,
-    source_uri: str,
-    source_type: str,
     *,
+    input_vectors: np.ndarray = None,
+    source_uri: str = None,
+    source_type: str = None,
     config=None,
     namespace: Optional[str] = None,
     size: int = -1,
@@ -32,10 +34,12 @@ def ingest(
         Type of vector index (FLAT, IVF_FLAT)
     index_uri: str
         Vector index URI (stored as TileDB group)
+    input_vectors: numpy Array
+        Input vectors, if this is provided it takes precedence over source_uri and source_type.
     source_uri: str
         Data source URI
     source_type: str
-        Type of the source data
+        Type of the source data. If left empty it is auto-detected from the suffix of source_uri
     config: None
         config dictionary, defaults to None
     namespace: str
@@ -88,6 +92,9 @@ def ingest(
     INDEX_ARRAY_NAME = storage_formats[STORAGE_VERSION]["INDEX_ARRAY_NAME"]
     IDS_ARRAY_NAME = storage_formats[STORAGE_VERSION]["IDS_ARRAY_NAME"]
     PARTS_ARRAY_NAME = storage_formats[STORAGE_VERSION]["PARTS_ARRAY_NAME"]
+    INPUT_VECTORS_ARRAY_NAME = storage_formats[STORAGE_VERSION][
+        "INPUT_VECTORS_ARRAY_NAME"
+    ]
     PARTIAL_WRITE_ARRAY_DIR = storage_formats[STORAGE_VERSION][
         "PARTIAL_WRITE_ARRAY_DIR"
     ]
@@ -139,8 +146,22 @@ def setup(
 
         return logger
 
+    def autodetect_source_type(source_uri: str) -> str:
+        if source_uri.endswith(".u8bin"):
+            return "U8BIN"
+        elif source_uri.endswith(".f32bin"):
+            return "F32BIN"
+        elif source_uri.endswith(".fvecs"):
+            return "FVEC"
+        elif source_uri.endswith(".ivecs"):
+            return "IVEC"
+        elif source_uri.endswith(".bvecs"):
+            return "BVEC"
+        else:
+            return "TILEDB_ARRAY"
+
     def read_source_metadata(
-        source_uri: str, source_type: str, logger: logging.Logger
+        source_uri: str, source_type: str = None
     ) -> Tuple[int, int, np.dtype]:
         if source_type == "TILEDB_ARRAY":
             schema = tiledb.ArraySchema.load(source_uri)
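For reference, the suffix mapping introduced above behaves like this (the helper is nested inside ingest(), so these calls are for exposition only; the URIs are hypothetical):

    autodetect_source_type("s3://bucket/base.u8bin")        # -> "U8BIN"
    autodetect_source_type("s3://bucket/queries.fvecs")     # -> "FVEC"
    autodetect_source_type("s3://bucket/groundtruth.ivecs") # -> "IVEC"
    autodetect_source_type("s3://bucket/my_tiledb_array")   # -> "TILEDB_ARRAY" (fallback)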
@@ -189,6 +210,53 @@ def read_source_metadata(
         else:
             raise ValueError(f"Not supported source_type {source_type}")
 
+    def write_input_vectors(
+        group: tiledb.Group,
+        input_vectors: np.ndarray,
+        size: int,
+        dimensions: int,
+        vector_type: np.dtype,
+    ) -> str:
+        input_vectors_array_uri = f"{group.uri}/{INPUT_VECTORS_ARRAY_NAME}"
+        if tiledb.array_exists(input_vectors_array_uri):
+            raise ValueError(f"Array exists {input_vectors_array_uri}")
+
+        logger.debug("Creating input vectors array")
+        input_vectors_array_rows_dim = tiledb.Dim(
+            name="rows",
+            domain=(0, dimensions - 1),
+            tile=dimensions,
+            dtype=np.dtype(np.int32),
+        )
+        input_vectors_array_cols_dim = tiledb.Dim(
+            name="cols",
+            domain=(0, size - 1),
+            tile=int(size / partitions),
+            dtype=np.dtype(np.int32),
+        )
+        input_vectors_array_dom = tiledb.Domain(
+            input_vectors_array_rows_dim, input_vectors_array_cols_dim
+        )
+        input_vectors_array_attr = tiledb.Attr(
+            name="values", dtype=vector_type, filters=DEFAULT_ATTR_FILTERS
+        )
+        input_vectors_array_schema = tiledb.ArraySchema(
+            domain=input_vectors_array_dom,
+            sparse=False,
+            attrs=[input_vectors_array_attr],
+            cell_order="col-major",
+            tile_order="col-major",
+        )
+        logger.debug(input_vectors_array_schema)
+        tiledb.Array.create(input_vectors_array_uri, input_vectors_array_schema)
+        group.add(input_vectors_array_uri, name=INPUT_VECTORS_ARRAY_NAME)
+
+        input_vectors_array = tiledb.open(input_vectors_array_uri, "w")
+        input_vectors_array[:, :] = np.transpose(input_vectors)
+        input_vectors_array.close()
+
+        return input_vectors_array_uri
+
     def create_arrays(
         group: tiledb.Group,
         index_type: str,
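Note the np.transpose on write in write_input_vectors: callers pass input_vectors with shape (size, dimensions), but the array is laid out col-major with one vector per column. A shape-only sketch (toy numbers, not from the commit):

    import numpy as np

    v = np.arange(6, dtype=np.float32).reshape(3, 2)  # (size=3, dimensions=2)
    stored = np.transpose(v)  # shape (2, 3): rows are dimensions, cols are vectors
    assert stored.shape == (2, 3)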
@@ -501,7 +569,7 @@ def read_input_vectors(
         config: Optional[Mapping[str, Any]] = None,
         verbose: bool = False,
         trace_id: Optional[str] = None,
-    ) -> np.array:
+    ) -> np.ndarray:
         logger = setup(config, verbose)
         logger.debug(
             "Reading input vectors start_pos: %i, end_pos: %i", start_pos, end_pos
@@ -669,7 +737,7 @@ def init_centroids(
         config: Optional[Mapping[str, Any]] = None,
         verbose: bool = False,
         trace_id: Optional[str] = None,
-    ) -> np.array:
+    ) -> np.ndarray:
         logger = setup(config, verbose)
         logger.debug(
             "Initialising centroids by reading the first vectors in the source data."
@@ -688,7 +756,7 @@ def init_centroids(
         )
 
     def assign_points_and_partial_new_centroids(
-        centroids: np.array,
+        centroids: np.ndarray,
         source_uri: str,
         source_type: str,
         vector_type: np.dtype,
@@ -859,7 +927,7 @@ def ingest_flat(
             target.close()
 
     def write_centroids(
-        centroids: np.array,
+        centroids: np.ndarray,
         index_group_uri: str,
         partitions: int,
         dimensions: int,
@@ -1379,12 +1447,14 @@ def consolidate_and_vacuum(
         index_group_uri: str,
         config: Optional[Mapping[str, Any]] = None,
     ):
+        group = tiledb.Group(index_group_uri, config=config)
+        if INPUT_VECTORS_ARRAY_NAME in group:
+            tiledb.Array.delete_array(group[INPUT_VECTORS_ARRAY_NAME].uri)
         modes = ["fragment_meta", "commits", "array_meta"]
         for mode in modes:
             conf = tiledb.Config(config)
             conf["sm.consolidation.mode"] = mode
             conf["sm.vacuum.mode"] = mode
-            group = tiledb.Group(index_group_uri, config=conf)
             tiledb.consolidate(group[PARTS_ARRAY_NAME].uri, config=conf)
             tiledb.vacuum(group[PARTS_ARRAY_NAME].uri, config=conf)
             if index_type == "IVF_FLAT":
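The cleanup added at the top of consolidate_and_vacuum uses standard TileDB group-membership checks; the same pattern in isolation (group URI hypothetical, array name taken from storage_formats.py below):

    import tiledb

    group = tiledb.Group("/tmp/my_flat_index")  # illustrative URI
    if "input_vectors" in group:
        # Remove the temporary copy written only for in-memory ingestion.
        tiledb.Array.delete_array(group["input_vectors"].uri)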
@@ -1416,9 +1486,24 @@ def consolidate_and_vacuum(
             raise err
         group = tiledb.Group(index_group_uri, "w")
 
-        in_size, dimensions, vector_type = read_source_metadata(
-            source_uri=source_uri, source_type=source_type, logger=logger
-        )
+        if input_vectors is not None:
+            in_size = input_vectors.shape[0]
+            dimensions = input_vectors.shape[1]
+            vector_type = input_vectors.dtype
+            source_uri = write_input_vectors(
+                group=group,
+                input_vectors=input_vectors,
+                size=in_size,
+                dimensions=dimensions,
+                vector_type=vector_type,
+            )
+            source_type = "TILEDB_ARRAY"
+        else:
+            if source_type is None:
+                source_type = autodetect_source_type(source_uri=source_uri)
+            in_size, dimensions, vector_type = read_source_metadata(
+                source_uri=source_uri, source_type=source_type
+            )
         if size == -1:
             size = in_size
         if size > in_size:
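And the file-based path after this change, where source_type may now be omitted (a sketch; URIs illustrative):

    # Suffix ".f32bin" is auto-detected, so source_type can be left out:
    ingest(
        index_type="IVF_FLAT",
        index_uri="/tmp/my_ivf_index",         # illustrative URI
        source_uri="s3://bucket/data.f32bin",  # -> source_type "F32BIN"
    )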

apis/python/src/tiledb/vector_search/storage_formats.py

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
         "INDEX_ARRAY_NAME": "index.tdb",
         "IDS_ARRAY_NAME": "ids.tdb",
         "PARTS_ARRAY_NAME": "parts.tdb",
+        "INPUT_VECTORS_ARRAY_NAME": "input_vectors",
         "PARTIAL_WRITE_ARRAY_DIR": "write_temp",
         "DEFAULT_ATTR_FILTERS": None,
     },
@@ -14,6 +15,7 @@
         "INDEX_ARRAY_NAME": "partition_indexes",
         "IDS_ARRAY_NAME": "shuffled_vector_ids",
         "PARTS_ARRAY_NAME": "shuffled_vectors",
+        "INPUT_VECTORS_ARRAY_NAME": "input_vectors",
         "PARTIAL_WRITE_ARRAY_DIR": "temp_data",
         "DEFAULT_ATTR_FILTERS": tiledb.FilterList([tiledb.ZstdFilter()]),
     },

apis/python/test/common.py

Lines changed: 2 additions & 24 deletions
@@ -11,28 +11,6 @@ def xbin_mmap(fname, dtype):
     return np.memmap(fname, dtype=dtype, mode="r", offset=8, shape=(n, d))
 
 
-def get_queries_fvec(file, dimensions, nqueries=None):
-    vfs = tiledb.VFS()
-    vector_values = 1 + dimensions
-    vector_size = vector_values * 4
-    read_size = nqueries
-    read_offset = 0
-    with vfs.open(file, "rb") as f:
-        f.seek(read_offset)
-        return np.delete(
-            np.reshape(
-                np.frombuffer(
-                    f.read(read_size * vector_size),
-                    count=read_size * vector_values,
-                    dtype=np.float32,
-                ).astype(np.float32),
-                (read_size, dimensions + 1),
-            ),
-            0,
-            axis=1,
-        )
-
-
 def get_groundtruth_ivec(file, k=None, nqueries=None):
     vfs = tiledb.VFS()
     vector_values = 1 + k
@@ -104,7 +82,7 @@ def create_random_dataset_f32(nb, d, nq, k, path):
         X, test_size=nq, random_state=1
     )
 
-    with open(os.path.join(path, "data"), "wb") as f:
+    with open(os.path.join(path, "data.f32bin"), "wb") as f:
         np.array([nb, d], dtype="uint32").tofile(f)
         data.astype("float32").tofile(f)
     with open(os.path.join(path, "queries"), "wb") as f:
@@ -138,7 +116,7 @@ def create_random_dataset_u8(nb, d, nq, k, path):
     data = data.astype("uint8")
     queries = queries.astype("uint8")
 
-    with open(os.path.join(path, "data"), "wb") as f:
+    with open(os.path.join(path, "data.u8bin"), "wb") as f:
         np.array([nb, d], dtype="uint32").tofile(f)
         data.tofile(f)
     with open(os.path.join(path, "queries"), "wb") as f:
