Skip to content

Commit 5ee2aaa

Browse files
Add query with driver implementation (#398)
1 parent cbe5577 commit 5ee2aaa

File tree

6 files changed

+259
-151
lines changed

6 files changed

+259
-151
lines changed

apis/python/src/tiledb/vector_search/flat_index.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,32 @@ class FlatIndex(index.Index):
3535
timestamp: int or tuple(int)
3636
If int, open the index at a given timestamp.
3737
If tuple, open at the given start and end timestamps.
38+
open_for_remote_query_execution: bool
39+
If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`.
40+
If `False`, load index data in main memory locally. Note that you can still use a taskgraph for query execution, you'll just end up loading the data both on your local machine and in the cloud taskgraph.
3841
"""
3942

4043
def __init__(
4144
self,
4245
uri: str,
4346
config: Optional[Mapping[str, Any]] = None,
4447
timestamp=None,
48+
open_for_remote_query_execution: bool = False,
4549
**kwargs,
4650
):
51+
self.index_open_kwargs = {
52+
"uri": uri,
53+
"config": config,
54+
"timestamp": timestamp,
55+
}
56+
self.index_open_kwargs.update(kwargs)
4757
self.index_type = INDEX_TYPE
48-
super().__init__(uri=uri, config=config, timestamp=timestamp)
58+
super().__init__(
59+
uri=uri,
60+
config=config,
61+
timestamp=timestamp,
62+
open_for_remote_query_execution=open_for_remote_query_execution,
63+
)
4964
self._index = None
5065
self.db_uri = self.group[
5166
storage_formats[self.storage_version]["PARTS_ARRAY_NAME"]
@@ -69,7 +84,7 @@ def __init__(
6984
].uri
7085
else:
7186
self.ids_uri = ""
72-
if self.size > 0:
87+
if self.size > 0 and not open_for_remote_query_execution:
7388
self._db = load_as_matrix(
7489
self.db_uri,
7590
ctx=self.ctx,
@@ -121,8 +136,6 @@ def query_internal(
121136
(queries.shape[0], k), MAX_UINT64
122137
)
123138

124-
assert queries.dtype == np.float32
125-
126139
queries_m = array_to_matrix(np.transpose(queries))
127140
d, i = query_vq_heap(self._db, queries_m, self._ids, k, nthreads)
128141

apis/python/src/tiledb/vector_search/index.py

Lines changed: 103 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import time
55
from typing import Any, Mapping, Optional
66

7+
from tiledb.cloud.dag import Mode
78
from tiledb.vector_search import _tiledbvspy as vspy
89
from tiledb.vector_search.module import *
910
from tiledb.vector_search.storage_formats import storage_formats
@@ -35,11 +36,15 @@ class Index:
3536
timestamp: int or tuple(int)
3637
If int, open the index at a given timestamp.
3738
If tuple, open at the given start and end timestamps.
39+
open_for_remote_query_execution: bool
40+
If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`.
41+
If `False`, load index data in main memory locally. Note that you can still use a taskgraph for query execution, you'll just end up loading the data both on your local machine and in the cloud taskgraph.
3842
"""
3943

4044
def __init__(
4145
self,
4246
uri: str,
47+
open_for_remote_query_execution: bool,
4348
config: Optional[Mapping[str, Any]] = None,
4449
timestamp=None,
4550
):
@@ -48,6 +53,7 @@ def __init__(
4853
config = dict(config)
4954

5055
self.uri = uri
56+
self.open_for_remote_query_execution = open_for_remote_query_execution
5157
self.config = config
5258
self.ctx = vspy.Ctx(config)
5359
self.group = tiledb.Group(self.uri, "r", ctx=tiledb.Ctx(config))
@@ -154,7 +160,66 @@ def __init__(
154160
self.thread_executor = futures.ThreadPoolExecutor()
155161
self.has_updates = self._check_has_updates()
156162

157-
def query(self, queries: np.ndarray, k: int, **kwargs):
163+
def _query_with_driver(
164+
self,
165+
queries: np.ndarray,
166+
k: int,
167+
driver_mode=None,
168+
driver_resources=None,
169+
driver_access_credentials_name=None,
170+
**kwargs,
171+
):
172+
from tiledb.cloud import dag
173+
174+
def query_udf(index_type, index_open_kwargs, query_kwargs):
175+
from tiledb.vector_search.flat_index import FlatIndex
176+
from tiledb.vector_search.ivf_flat_index import IVFFlatIndex
177+
from tiledb.vector_search.vamana_index import VamanaIndex
178+
179+
# Open index
180+
if index_type == "FLAT":
181+
index = FlatIndex(**index_open_kwargs)
182+
elif index_type == "IVF_FLAT":
183+
index = IVFFlatIndex(**index_open_kwargs)
184+
elif index_type == "VAMANA":
185+
index = VamanaIndex(**index_open_kwargs)
186+
187+
# Query index
188+
return index.query(**query_kwargs)
189+
190+
d = dag.DAG(
191+
name="vector-query",
192+
mode=driver_mode,
193+
max_workers=1,
194+
)
195+
query_kwargs = {
196+
"queries": queries,
197+
"k": k,
198+
}
199+
query_kwargs.update(kwargs)
200+
node = d.submit(
201+
query_udf,
202+
self.index_type,
203+
self.index_open_kwargs,
204+
query_kwargs,
205+
name="vector-query-driver",
206+
resources=driver_resources,
207+
image_name="vectorsearch",
208+
access_credentials_name=driver_access_credentials_name,
209+
)
210+
d.compute()
211+
d.wait()
212+
return node.result()
213+
214+
def query(
215+
self,
216+
queries: np.ndarray,
217+
k: int,
218+
driver_mode: Mode = None,
219+
driver_resources: Optional[str] = None,
220+
driver_access_credentials_name: Optional[str] = None,
221+
**kwargs,
222+
):
158223
"""
159224
Queries an index with a set of query vectors, retrieving the `k` most similar vectors for each query.
160225
@@ -164,12 +229,23 @@ def query(self, queries: np.ndarray, k: int, **kwargs):
164229
- Calls the algorithm specific implementation of `query_internal` to query the base data.
165230
- Merges the results applying the updated data.
166231
232+
You can control where the query is executed by setting the `driver_mode` parameter:
233+
- With `driver_mode = None`, the driver logic for the query will be executed locally.
234+
- If `driver_mode` is not `None`, we will use a TileDB cloud taskgraph to re-open the index and run the query.
235+
With both options, certain implementations, e.g. IVF Flat, may let you create further TileDB taskgraphs as defined in the implementation-specific `query_internal` methods.
236+
167237
Parameters
168238
----------
169239
queries: np.ndarray
170240
2D array of query vectors. This can be used as a batch query interface by passing multiple queries in one call.
171241
k: int
172242
Number of results to return per query vector.
243+
driver_mode: Mode
244+
If not `None`, the query will be executed in a TileDB cloud taskgraph using the driver mode specified.
245+
driver_resources: Optional[str]
246+
If `driver_mode` was not `None`, the resources to use for the driver execution.
247+
driver_access_credentials_name: Optional[str]
248+
If `driver_mode` was not `None`, the access credentials name to use for the driver execution.
173249
**kwargs
174250
Extra kwargs passed here are passed to the `query_internal` implementation of the concrete index class.
175251
"""
@@ -184,6 +260,32 @@ def query(self, queries: np.ndarray, k: int, **kwargs):
184260
f"A query in queries has {query_dimensions} dimensions, but the indexed data had {self.dimensions} dimensions"
185261
)
186262

263+
if queries.dtype != np.float32:
264+
raise TypeError(
265+
f"Expected queries to have dtype np.float32, but it had dtype {queries.dtype}"
266+
)
267+
268+
if driver_mode == Mode.LOCAL:
269+
# @todo: Fix bug with driver_mode=Mode.LOCAL and remove this check.
270+
raise TypeError(
271+
"Cannot pass driver_mode=Mode.LOCAL to query() - use driver_mode=None to query locally."
272+
)
273+
274+
if driver_mode is not None:
275+
return self._query_with_driver(
276+
queries,
277+
k,
278+
driver_mode,
279+
driver_resources,
280+
driver_access_credentials_name,
281+
**kwargs,
282+
)
283+
284+
if self.open_for_remote_query_execution:
285+
raise ValueError(
286+
"Cannot query an index with driver_mode=None without loading the index data in main memory. Set open_for_remote_query_execution=False when creating the index to load the index data before query."
287+
)
288+
187289
with tiledb.scope_ctx(ctx_or_config=self.config):
188290
if not self.has_updates:
189291
if self.query_base_array:
@@ -575,7 +677,6 @@ def _query_additions(
575677
timestamp=None,
576678
config=None,
577679
):
578-
assert queries.dtype == np.float32
579680
additions_vectors, additions_external_ids, updated_ids = Index._read_additions(
580681
updates_array_uri, timestamp, config
581682
)

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2855,9 +2855,7 @@ def consolidate_and_vacuum(
28552855
if index_type == "FLAT":
28562856
return flat_index.FlatIndex(uri=index_group_uri, config=config)
28572857
elif index_type == "VAMANA":
2858-
return vamana_index.VamanaIndex(
2859-
uri=index_group_uri, config=config, debug=True
2860-
)
2858+
return vamana_index.VamanaIndex(uri=index_group_uri, config=config)
28612859
elif index_type == "IVF_FLAT":
28622860
return ivf_flat_index.IVFFlatIndex(
28632861
uri=index_group_uri, memory_budget=1000000, config=config

apis/python/src/tiledb/vector_search/ivf_flat_index.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ class IVFFlatIndex(index.Index):
7070
If not provided, all index data are loaded in main memory.
7171
Otherwise, no index data are loaded in main memory and this memory budget is
7272
applied during queries.
73+
open_for_remote_query_execution: bool
74+
If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`. We then load index data in the taskgraph based on `memory_budget`.
75+
If `False`, load index data in main memory locally according to `memory_budget`. Note that you can still use a taskgraph for query execution, you'll just end up loading the data both on your local machine and in the cloud taskgraph.
7376
"""
7477

7578
def __init__(
@@ -78,10 +81,23 @@ def __init__(
7881
config: Optional[Mapping[str, Any]] = None,
7982
timestamp=None,
8083
memory_budget: int = -1,
84+
open_for_remote_query_execution: bool = False,
8185
**kwargs,
8286
):
87+
self.index_open_kwargs = {
88+
"uri": uri,
89+
"config": config,
90+
"timestamp": timestamp,
91+
"memory_budget": memory_budget,
92+
}
93+
self.index_open_kwargs.update(kwargs)
8394
self.index_type = INDEX_TYPE
84-
super().__init__(uri=uri, config=config, timestamp=timestamp)
95+
super().__init__(
96+
uri=uri,
97+
config=config,
98+
timestamp=timestamp,
99+
open_for_remote_query_execution=open_for_remote_query_execution,
100+
)
85101
self.db_uri = self.group[
86102
storage_formats[self.storage_version]["PARTS_ARRAY_NAME"]
87103
+ self.index_version
@@ -125,28 +141,29 @@ def __init__(
125141
else:
126142
self.partitions = self.partition_history[self.history_index]
127143

128-
self._centroids = load_as_matrix(
129-
self.centroids_uri,
130-
ctx=self.ctx,
131-
size=self.partitions,
132-
config=config,
133-
timestamp=self.base_array_timestamp,
134-
)
135-
self._index = read_vector_u64(
136-
self.ctx,
137-
self.index_array_uri,
138-
0,
139-
self.partitions + 1,
140-
self.base_array_timestamp,
141-
)
144+
if not open_for_remote_query_execution:
145+
self._centroids = load_as_matrix(
146+
self.centroids_uri,
147+
ctx=self.ctx,
148+
size=self.partitions,
149+
config=config,
150+
timestamp=self.base_array_timestamp,
151+
)
152+
self._index = read_vector_u64(
153+
self.ctx,
154+
self.index_array_uri,
155+
0,
156+
self.partitions + 1,
157+
self.base_array_timestamp,
158+
)
142159

143160
if self.base_size == -1:
144161
self.size = self._index[self.partitions]
145162
else:
146163
self.size = self.base_size
147164

148165
# TODO pass in a context
149-
if self.memory_budget == -1:
166+
if not open_for_remote_query_execution and self.memory_budget == -1:
150167
self._db = load_as_matrix(
151168
self.db_uri,
152169
ctx=self.ctx,
@@ -225,8 +242,6 @@ def query_internal(
225242
if (mode != Mode.REALTIME and mode != Mode.BATCH) and resource_class:
226243
raise TypeError("Can only pass resource_class in REALTIME or BATCH mode")
227244

228-
assert queries.dtype == np.float32
229-
230245
if queries.ndim == 1:
231246
queries = np.array([queries])
232247

@@ -391,7 +406,6 @@ def dist_qv_udf(
391406
results.append(tmp_results)
392407
return results
393408

394-
assert queries.dtype == np.float32
395409
if num_partitions == -1:
396410
num_partitions = 5
397411
if num_workers == -1:

apis/python/src/tiledb/vector_search/vamana_index.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,33 @@ class VamanaIndex(index.Index):
3838
URI of the index.
3939
config: Optional[Mapping[str, Any]]
4040
TileDB config dictionary.
41+
open_for_remote_query_execution: bool
42+
If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`.
43+
If `False`, load index data in main memory locally. Note that you can still use a taskgraph for query execution, you'll just end up loading the data both on your local machine and in the cloud taskgraph.
4144
"""
4245

4346
def __init__(
4447
self,
4548
uri: str,
4649
config: Optional[Mapping[str, Any]] = None,
4750
timestamp=None,
51+
open_for_remote_query_execution: bool = False,
4852
**kwargs,
4953
):
50-
super().__init__(uri=uri, config=config, timestamp=timestamp)
54+
self.index_open_kwargs = {
55+
"uri": uri,
56+
"config": config,
57+
"timestamp": timestamp,
58+
}
59+
self.index_open_kwargs.update(kwargs)
60+
super().__init__(
61+
uri=uri,
62+
config=config,
63+
timestamp=timestamp,
64+
open_for_remote_query_execution=open_for_remote_query_execution,
65+
)
5166
self.index_type = INDEX_TYPE
67+
# TODO(SC-48710): Add support for `open_for_remote_query_execution`. We don't leave `self.index` as `None` because we need to be able to call index.dimensions().
5268
self.index = vspy.IndexVamana(self.ctx, uri, to_temporal_policy(timestamp))
5369
self.db_uri = self.group[
5470
storage_formats[self.storage_version]["PARTS_ARRAY_NAME"]
@@ -96,13 +112,11 @@ def query_internal(
96112
opt_l: int
97113
How deep to search. Should be >= k, and if it's not, we will set it to k.
98114
"""
99-
warnings.warn("The Vamana index is not yet supported, please use with caution.")
100115
if self.size == 0:
101116
return np.full((queries.shape[0], k), MAX_FLOAT32), np.full(
102117
(queries.shape[0], k), MAX_UINT64
103118
)
104119

105-
assert queries.dtype == np.float32
106120
if opt_l < k:
107121
warnings.warn(f"opt_l ({opt_l}) should be >= k ({k}), setting to k")
108122
opt_l = k
@@ -144,7 +158,6 @@ def create(
144158
The TileDB vector search storage version to use.
145159
If not provided, use the latest stable storage version.
146160
"""
147-
warnings.warn("The Vamana index is not yet supported, please use with caution.")
148161
validate_storage_version(storage_version)
149162
ctx = vspy.Ctx(config)
150163
index = vspy.IndexVamana(
@@ -160,4 +173,4 @@ def create(
160173
index.train(empty_vector)
161174
index.add(empty_vector)
162175
index.write_index(ctx, uri, vspy.TemporalPolicy(0), storage_version)
163-
return VamanaIndex(uri=uri, config=config, memory_budget=1000000)
176+
return VamanaIndex(uri=uri, config=config)

0 commit comments

Comments
 (0)