Add storage format versions and store dtype, partition count, and storage version in group metadata

Nikos Papailiou · Nikos Papailiou · commit 3fabedac2d9e · 2023-08-02T13:55:17.000+03:00
diff --git a/apis/python/setup.py b/apis/python/setup.py
@@ -35,14 +35,15 @@ def get_cmake_overrides():
         conf.append("-DUSE_MKL_CBLAS={}".format(val))
 
     try:
-      # Make sure we use pybind11 from this python environment if available,
-      # required for windows wheels due to:
-      #   https://github.com/pybind/pybind11/issues/3445
-      import pybind11
-      pb11_path =  pybind11.get_cmake_dir()
-      conf.append(f"-Dpybind11_DIR={pb11_path}")
+        # Make sure we use pybind11 from this python environment if available,
+        # required for windows wheels due to:
+        #   https://github.com/pybind/pybind11/issues/3445
+        import pybind11
+
+        pb11_path = pybind11.get_cmake_dir()
+        conf.append(f"-Dpybind11_DIR={pb11_path}")
     except ImportError:
-      pass
+        pass
 
     return conf
 
@@ -62,5 +63,5 @@ def get_cmake_overrides():
     cmake_args=cmake_args,
     cmake_install_target="install-libtiledbvectorsearch",
     cmake_install_dir="src/tiledb/vector_search",
-    use_scm_version={"root": "../../", "relative_to":  __file__},
+    use_scm_version={"root": "../../", "relative_to": __file__},
 )
diff --git a/apis/python/src/tiledb/vector_search/__init__.py b/apis/python/src/tiledb/vector_search/__init__.py
@@ -1,6 +1,7 @@
 from . import utils
 from .index import FlatIndex, IVFFlatIndex
 from .ingestion import ingest
+from .storage_formats import storage_formats, STORAGE_VERSION
 from .module import load_as_array
 from .module import load_as_matrix
 from .module import (
@@ -34,5 +35,5 @@
     "ivf_index_tdb",
     "array_to_matrix",
     "partition_ivf_index",
-    "utils"
+    "utils",
 ]
diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py
@@ -5,14 +5,10 @@
 
 import numpy as np
 from tiledb.vector_search.module import *
+from tiledb.vector_search.storage_formats import storage_formats
 from tiledb.cloud.dag import Mode
 from typing import Any, Mapping
 
-CENTROIDS_ARRAY_NAME = "centroids.tdb"
-INDEX_ARRAY_NAME = "index.tdb"
-IDS_ARRAY_NAME = "ids.tdb"
-PARTS_ARRAY_NAME = "parts.tdb"
-
 
 def submit_local(d, func, *args, **kwargs):
     # Drop kwarg
@@ -22,7 +18,7 @@ def submit_local(d, func, *args, **kwargs):
 
 
 class Index:
-    def query(self, targets: np.ndarray, k=10, nqueries=10, nthreads=8, nprobe=1):
+    def query(self, targets: np.ndarray, k):
         raise NotImplementedError
 
 
@@ -36,40 +32,40 @@ class FlatIndex(Index):
         URI of datataset
     dtype: numpy.dtype
         datatype float32 or uint8
-    parts_name: str
-        Optional name of partitions
     """
 
     def __init__(
         self,
         uri: str,
-        dtype: Optional[np.dtype] = None,
-        parts_name: str = "parts.tdb",
         config: Optional[Mapping[str, Any]] = None,
     ):
         # If the user passes a tiledb python Config object convert to a dictionary
         if isinstance(config, tiledb.Config):
             config = dict(config)
 
         self.uri = uri
-        self.dtype = dtype
         self._index = None
         self.ctx = Ctx(config)
         self.config = config
+        group = tiledb.Group(uri, ctx=tiledb.Ctx(config))
+        self.storage_version = group.meta.get("storage_version", "0.1")
+        self._db = load_as_matrix(
+            group[storage_formats[self.storage_version]["PARTS_ARRAY_NAME"]].uri,
+            ctx=self.ctx,
+            config=config,
+        )
 
-        self._db = load_as_matrix(os.path.join(uri, parts_name), ctx=self.ctx, config=config)
-
+        dtype = group.meta.get("dtype", None)
         if dtype is None:
             self.dtype = self._db.dtype
         else:
-            self.dtype = dtype
+            self.dtype = np.dtype(dtype)
 
     def query(
         self,
         targets: np.ndarray,
         k: int = 10,
         nthreads: int = 8,
-        nprobe: int = 1,
         query_type="heap",
     ):
         """
@@ -84,9 +80,7 @@ def query(
         nqueries: int
             Number of queries
         nthreads: int
-            Number of threads to use for queyr
-        nprobe: int
-            number of probes
+            Number of threads to use for query
         """
         # TODO:
         # - typecheck targets
@@ -123,7 +117,6 @@ class IVFFlatIndex(Index):
     def __init__(
         self,
         uri,
-        dtype: np.dtype = None,
         memory_budget: int = -1,
         config: Optional[Mapping[str, Any]] = None,
     ):
@@ -134,31 +127,48 @@ def __init__(
         self.config = config
         self.ctx = Ctx(config)
         group = tiledb.Group(uri, ctx=tiledb.Ctx(config))
-        self.parts_db_uri = group[PARTS_ARRAY_NAME].uri
-        self.centroids_uri = group[CENTROIDS_ARRAY_NAME].uri
-        self.index_uri = group[INDEX_ARRAY_NAME].uri
-        self.ids_uri = group[IDS_ARRAY_NAME].uri
+        self.storage_version = group.meta.get("storage_version", "0.1")
+        self.parts_db_uri = group[
+            storage_formats[self.storage_version]["PARTS_ARRAY_NAME"]
+        ].uri
+        self.centroids_uri = group[
+            storage_formats[self.storage_version]["CENTROIDS_ARRAY_NAME"]
+        ].uri
+        self.index_uri = group[
+            storage_formats[self.storage_version]["INDEX_ARRAY_NAME"]
+        ].uri
+        self.ids_uri = group[
+            storage_formats[self.storage_version]["IDS_ARRAY_NAME"]
+        ].uri
         self.memory_budget = memory_budget
 
+        self._centroids = load_as_matrix(
+            self.centroids_uri, ctx=self.ctx, config=config
+        )
+        self._index = read_vector_u64(self.ctx, self.index_uri)
+
         # TODO pass in a context
         if self.memory_budget == -1:
             self._db = load_as_matrix(self.parts_db_uri, ctx=self.ctx, config=config)
             self._ids = read_vector_u64(self.ctx, self.ids_uri)
 
-        self._centroids = load_as_matrix(self.centroids_uri, ctx=self.ctx, config=config)
-
-        # TODO this should always be available
+        dtype = group.meta.get("dtype", None)
         if dtype is None:
-            self.dtype = self._centroids.dtype
+            schema = tiledb.ArraySchema.load(self.parts_db_uri)
+            self.dtype = np.dtype(schema.attr("values").dtype)
         else:
-            self.dtype = dtype
-        self._index = read_vector_u64(self.ctx, self.index_uri)
+            self.dtype = np.dtype(dtype)
+
+        self.partitions = group.meta.get("partitions", -1)
+        if self.partitions == -1:
+            schema = tiledb.ArraySchema.load(self.centroids_uri)
+            self.partitions = schema.domain.dim("cols").domain[1] + 1
 
     def query(
         self,
         queries: np.ndarray,
         k: int = 10,
-        nprobe: int = 10,
+        nprobe: int = 1,
         nthreads: int = -1,
         use_nuv_implementation: bool = False,
         mode: Mode = None,
@@ -198,6 +208,8 @@ def query(
 
         if nthreads == -1:
             nthreads = multiprocessing.cpu_count()
+
+        nprobe = min(nprobe, self.partitions)
         if mode is None:
             queries_m = array_to_matrix(np.transpose(queries))
             if self.memory_budget == -1:
@@ -313,7 +325,7 @@ def dist_qv_udf(
                 active_queries=active_queries,
                 indices=indices,
                 k_nn=k_nn,
-                ctx=Ctx(config)
+                ctx=Ctx(config),
             )
             results = []
             for q in range(len(r)):
@@ -377,9 +389,7 @@ def dist_qv_udf(
                     ids_uri=self.ids_uri,
                     query_vectors=queries,
                     active_partitions=np.array(active_partitions)[part:part_end],
-                    active_queries=np.array(
-                        aq, dtype=object
-                    ),
+                    active_queries=np.array(aq, dtype=object),
                     indices=np.array(self._index),
                     k_nn=k,
                     config=config,
@@ -406,5 +416,5 @@ def dist_qv_udf(
             tmp = sorted(tmp_results, key=lambda t: t[0])[0:k]
             for j in range(len(tmp), k):
                 tmp.append((float(0.0), int(0)))
-            results_per_query.append(np.array(tmp, dtype=np.dtype('float,int'))['f1'])
+            results_per_query.append(np.array(tmp, dtype=np.dtype("float,int"))["f1"])
         return results_per_query
diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py
@@ -79,12 +79,15 @@ def ingest(
     from tiledb.cloud.rest_api import models
     from tiledb.cloud.utilities import get_logger
     from tiledb.cloud.utilities import set_aws_context
-
-    CENTROIDS_ARRAY_NAME = "centroids.tdb"
-    INDEX_ARRAY_NAME = "index.tdb"
-    IDS_ARRAY_NAME = "ids.tdb"
-    PARTS_ARRAY_NAME = "parts.tdb"
-    PARTIAL_WRITE_ARRAY_DIR = "write_temp"
+    from tiledb.vector_search.storage_formats import storage_formats, STORAGE_VERSION
+
+    CENTROIDS_ARRAY_NAME = storage_formats[STORAGE_VERSION]["CENTROIDS_ARRAY_NAME"]
+    INDEX_ARRAY_NAME = storage_formats[STORAGE_VERSION]["INDEX_ARRAY_NAME"]
+    IDS_ARRAY_NAME = storage_formats[STORAGE_VERSION]["IDS_ARRAY_NAME"]
+    PARTS_ARRAY_NAME = storage_formats[STORAGE_VERSION]["PARTS_ARRAY_NAME"]
+    PARTIAL_WRITE_ARRAY_DIR = storage_formats[STORAGE_VERSION][
+        "PARTIAL_WRITE_ARRAY_DIR"
+    ]
     VECTORS_PER_WORK_ITEM = 20000000
     MAX_TASKS_PER_STAGE = 100
     CENTRALISED_KMEANS_MAX_SAMPLE_SIZE = 1000000
@@ -1378,7 +1381,6 @@ def consolidate_and_vacuum(
                 logger.debug(f"Group '{array_uri}' already exists")
             raise err
         group = tiledb.Group(array_uri, "w")
-        group.meta["dataset_type"] = "vector_search"
 
         in_size, dimensions, vector_type = read_source_metadata(
             source_uri=source_uri, source_type=source_type, logger=logger
@@ -1402,6 +1404,10 @@ def consolidate_and_vacuum(
         logger.debug("Partitions %d", partitions)
         logger.debug("Training sample size %d", training_sample_size)
         logger.debug("Number of workers %d", workers)
+        group.meta["dataset_type"] = "vector_search"
+        group.meta["dtype"] = np.dtype(vector_type).name
+        group.meta["partitions"] = partitions
+        group.meta["storage_version"] = STORAGE_VERSION
 
         if input_vectors_per_work_item == -1:
             input_vectors_per_work_item = VECTORS_PER_WORK_ITEM
@@ -1487,8 +1493,6 @@ def consolidate_and_vacuum(
         consolidate_and_vacuum(array_uri=array_uri, config=config)
 
         if index_type == "FLAT":
-            return FlatIndex(uri=array_uri, dtype=vector_type, config=config)
+            return FlatIndex(uri=array_uri, config=config)
         elif index_type == "IVF_FLAT":
-            return IVFFlatIndex(
-                uri=array_uri, dtype=vector_type, memory_budget=1000000, config=config
-            )
+            return IVFFlatIndex(uri=array_uri, memory_budget=1000000, config=config)
diff --git a/apis/python/src/tiledb/vector_search/module.py b/apis/python/src/tiledb/vector_search/module.py
@@ -8,7 +8,12 @@
 from typing import Optional, Mapping, Any
 
 
-def load_as_matrix(path: str, nqueries: int = 0, ctx: "Ctx" = None, config: Optional[Mapping[str, Any]] = None):
+def load_as_matrix(
+    path: str,
+    nqueries: int = 0,
+    ctx: "Ctx" = None,
+    config: Optional[Mapping[str, Any]] = None,
+):
     """
     Load array as Matrix class
 
@@ -48,7 +53,12 @@ def load_as_matrix(path: str, nqueries: int = 0, ctx: "Ctx" = None, config: Opti
     return m
 
 
-def load_as_array(path, return_matrix: bool = False, ctx: "Ctx" = None, config: Optional[Mapping[str, Any]] = None):
+def load_as_array(
+    path,
+    return_matrix: bool = False,
+    ctx: "Ctx" = None,
+    config: Optional[Mapping[str, Any]] = None,
+):
     """
     Load array as array class
 
diff --git a/apis/python/src/tiledb/vector_search/storage_formats.py b/apis/python/src/tiledb/vector_search/storage_formats.py
@@ -0,0 +1,18 @@
+storage_formats = {  # storage version -> names of the members of an index group
+    "0.1": {  # assumed by index.py when group metadata has no "storage_version"
+        "CENTROIDS_ARRAY_NAME": "centroids.tdb",
+        "INDEX_ARRAY_NAME": "index.tdb",
+        "IDS_ARRAY_NAME": "ids.tdb",
+        "PARTS_ARRAY_NAME": "parts.tdb",
+        "PARTIAL_WRITE_ARRAY_DIR": "write_temp",
+    },
+    "0.2": {  # same keys as "0.1"; only the array names differ
+        "CENTROIDS_ARRAY_NAME": "partition_centroids",
+        "INDEX_ARRAY_NAME": "partition_indexes",
+        "IDS_ARRAY_NAME": "shuffled_vector_ids",
+        "PARTS_ARRAY_NAME": "shuffled_vectors",
+        "PARTIAL_WRITE_ARRAY_DIR": "write_temp",
+    },
+}
+
+STORAGE_VERSION = "0.2"  # version ingestion writes into group.meta["storage_version"]
diff --git a/apis/python/src/tiledb/vector_search/utils.py b/apis/python/src/tiledb/vector_search/utils.py
@@ -2,35 +2,69 @@
 import numpy as np
 import io
 
+
 def _load_vecs_t(uri, dtype, ctx_or_config=None):
-  with tiledb.scope_ctx(ctx_or_config) as ctx:
-      dtype = np.dtype(dtype)
-      vfs = tiledb.VFS(ctx.config())
-      with vfs.open(uri, "rb") as f:
-          d = f.read(-1)
-          raw = np.frombuffer(d, dtype=np.uint8)
-          ndim = raw[:4].view(np.int32)[0]
-
-          elem_nbytes = int(4 + ndim * dtype.itemsize)
-          if raw.size % elem_nbytes != 0:
-              raise ValueError(
-                  f"Mismatched dims to bytes in file {uri}: {raw.size}, elem_nbytes"
-              )
-          # take a view on the whole array as
-          # (ndim, sizeof(t)*ndim), and return the actual elements
-          #return raw.view(np.uint8).reshape((elem_nbytes,-1))[4:,:].view(dtype).reshape((ndim,-1))
-
-          if dtype != np.uint8:
-              return raw.view(np.int32).reshape((-1,ndim + 1))[:,1:].view(dtype)
-          else:
-              return raw.view(np.uint8).reshape((-1,ndim + 1))[:,1:].view(dtype)
-          #return raw
+    """
+    Load a ``*vecs``-style binary vector file into a 2-D numpy array.
+
+    The file is read as fixed-size records: a 4-byte int32 dimension header
+    followed by ``ndim`` elements of ``dtype``. The header is dropped from
+    every record of the result.
+
+    Parameters
+    ----------
+    uri: str
+        Location of the file, opened through ``tiledb.VFS``
+    dtype: numpy.dtype
+        Element type of the stored vectors (anything ``np.dtype`` accepts)
+    ctx_or_config:
+        Forwarded to ``tiledb.scope_ctx``; its config is used for the VFS
+
+    Returns
+    -------
+    numpy.ndarray
+        One row per record, ``ndim`` columns of ``dtype``
+
+    Raises
+    ------
+    ValueError
+        If the file size is not a multiple of the record size
+    """
+    with tiledb.scope_ctx(ctx_or_config) as ctx:
+        dtype = np.dtype(dtype)
+        vfs = tiledb.VFS(ctx.config())
+        with vfs.open(uri, "rb") as f:
+            d = f.read(-1)
+            raw = np.frombuffer(d, dtype=np.uint8)
+            # The first record's header gives the dimensionality of every vector
+            ndim = raw[:4].view(np.int32)[0]
+
+            elem_nbytes = int(4 + ndim * dtype.itemsize)
+            if raw.size % elem_nbytes != 0:
+                raise ValueError(
+                    f"Mismatched dims to bytes in file {uri}: {raw.size}, {elem_nbytes}"
+                )
+
+            if dtype != np.uint8:
+                # NOTE(review): framing records as int32 columns assumes 4-byte
+                # elements (int32/float32, as the loaders below pass) - confirm
+                # before using other dtypes
+                return raw.view(np.int32).reshape((-1, ndim + 1))[:, 1:].view(dtype)
+            else:
+                # With 1-byte elements the int32 header spans 4 columns per record
+                return raw.view(np.uint8).reshape((-1, ndim + 4))[:, 4:].view(dtype)
+
 
 def load_ivecs(uri, ctx_or_config=None):
-  return _load_vecs_t(uri, np.int32, ctx_or_config)
+    """Load vectors with int32 elements from ``uri`` (see ``_load_vecs_t``)."""
+    return _load_vecs_t(uri, np.int32, ctx_or_config)
+
 
 def load_fvecs(uri, ctx_or_config=None):
-  return _load_vecs_t(uri, np.float32, ctx_or_config)
+    """Load vectors with float32 elements from ``uri`` (see ``_load_vecs_t``)."""
+    return _load_vecs_t(uri, np.float32, ctx_or_config)
+
 
 def load_bvecs(uri, ctx_or_config=None):
-  return _load_vecs_t(uri, np.uint8, ctx_or_config)
+    """Load vectors with uint8 elements from ``uri`` (see ``_load_vecs_t``)."""
+    return _load_vecs_t(uri, np.uint8, ctx_or_config)
diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py