Skip to content

Commit 589958e

Browse files
Fix BATCH execution error (#351)
Added tests for BATCH execution in cloud This was causing a pickling error during BATCH execution. ``` Traceback (most recent call last): File "/opt/conda/lib/python3.9/site-packages/tdbudf/batch_udf_main.py", line 339, in real_main result = udf(*args, **kwargs) File "/Users/npapa/miniforge3/envs/tiledb_vs_8_arm/lib/python3.9/site-packages/tiledb/vector_search/object_api/embeddings_ingestion.py", line 432, in ingest_embeddings File "/opt/conda/lib/python3.9/site-packages/tiledb/cloud/dag/dag.py", line 1162, in compute self._batch_taskgraph = self._build_batch_taskgraph() File "/opt/conda/lib/python3.9/site-packages/tiledb/cloud/dag/dag.py", line 1534, in _build_batch_taskgraph kwargs["executable_code"] = codecs.PickleCodec.encode_base64(func) File "/opt/conda/lib/python3.9/site-packages/tiledb/cloud/_results/codecs.py", line 54, in encode_base64 data_bytes = cls.encode(obj) File "/opt/conda/lib/python3.9/site-packages/tiledb/cloud/_results/codecs.py", line 151, in encode return cloudpickle.dumps(obj, protocol=_PICKLE_PROTOCOL) File "/opt/conda/lib/python3.9/site-packages/cloudpickle/cloudpickle_fast.py", line 73, in dumps cp.dump(obj) File "/opt/conda/lib/python3.9/site-packages/cloudpickle/cloudpickle_fast.py", line 632, in dump return Pickler.dump(self, obj) TypeError: cannot pickle 'FilterList' object ```
1 parent 256eb5e commit 589958e

File tree

4 files changed

+178
-13
lines changed

4 files changed

+178
-13
lines changed

apis/python/src/tiledb/vector_search/object_api/embeddings_ingestion.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ def install_extra_driver_modules():
154154
import numpy as np
155155

156156
import tiledb
157+
from tiledb.vector_search.object_api import ObjectIndex
158+
from tiledb.vector_search.storage_formats import storage_formats
157159

158160
def instantiate_object(code, class_name, **kwargs):
159161
import importlib.util
@@ -178,7 +180,7 @@ def instantiate_object(code, class_name, **kwargs):
178180
return class_(**kwargs)
179181

180182
logger = setup(config, verbose)
181-
obj_index = object_index.ObjectIndex(
183+
obj_index = ObjectIndex(
182184
object_index_uri,
183185
config=config,
184186
environment_variables=environment_variables,

apis/python/src/tiledb/vector_search/object_api/object_index.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -223,16 +223,18 @@ def query(
223223
def update_object_reader(
224224
self,
225225
object_reader: ObjectReader,
226+
config: Optional[Mapping[str, Any]] = None,
226227
):
227-
self.object_reader = object_reader
228-
self.object_reader_source_code = get_source_code(object_reader)
229-
self.object_reader_class_name = object_reader.__class__.__name__
230-
self.object_reader_kwargs = json.dumps(object_reader.init_kwargs())
231-
group = tiledb.Group(self.uri, "w")
232-
group.meta["object_reader_source_code"] = self.object_reader_source_code
233-
group.meta["object_reader_class_name"] = self.object_reader_class_name
234-
group.meta["object_reader_kwargs"] = self.object_reader_kwargs
235-
group.close()
228+
with tiledb.scope_ctx(ctx_or_config=config):
229+
self.object_reader = object_reader
230+
self.object_reader_source_code = get_source_code(object_reader)
231+
self.object_reader_class_name = object_reader.__class__.__name__
232+
self.object_reader_kwargs = json.dumps(object_reader.init_kwargs())
233+
group = tiledb.Group(self.uri, "w")
234+
group.meta["object_reader_source_code"] = self.object_reader_source_code
235+
group.meta["object_reader_class_name"] = self.object_reader_class_name
236+
group.meta["object_reader_kwargs"] = self.object_reader_kwargs
237+
group.close()
236238

237239
def create_embeddings_partitioned_array(
238240
self,

apis/python/test/common.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77

88
import tiledb
9+
from tiledb.cloud import groups
910
from tiledb.vector_search.storage_formats import STORAGE_VERSION
1011
from tiledb.vector_search.storage_formats import storage_formats
1112

@@ -363,3 +364,31 @@ def quantize_embeddings_int8(
363364
starts = ranges[0, :]
364365
steps = (ranges[1, :] - ranges[0, :]) / 255
365366
return ((embeddings - starts) / steps - 128).astype(np.int8)
367+
368+
369+
def setUpCloudToken():
370+
token = os.getenv("TILEDB_REST_TOKEN")
371+
if os.getenv("TILEDB_CLOUD_HELPER_VAR"):
372+
token = os.getenv("TILEDB_CLOUD_HELPER_VAR")
373+
tiledb.cloud.login(token=token)
374+
375+
376+
def create_cloud_uri(name):
377+
namespace, storage_path, _ = groups._default_ns_path_cred()
378+
storage_path = storage_path.replace("//", "/").replace("/", "//", 1)
379+
rand_name = random_name("vector_search")
380+
test_path = f"tiledb://{namespace}/{storage_path}/{rand_name}"
381+
return f"{test_path}/{name}"
382+
383+
384+
def delete_uri(uri, config):
385+
with tiledb.scope_ctx(ctx_or_config=config):
386+
try:
387+
group = tiledb.Group(uri, "m")
388+
except tiledb.TileDBError as err:
389+
message = str(err)
390+
if "does not exist" in message:
391+
return
392+
else:
393+
raise err
394+
group.delete(recursive=True)

apis/python/test/test_object_index.py

Lines changed: 135 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44

55
import tiledb
6+
from tiledb.cloud.dag import Mode
67
from tiledb.vector_search.embeddings import ObjectEmbedding
78
from tiledb.vector_search.object_api import object_index
89
from tiledb.vector_search.object_readers import ObjectPartition
@@ -142,9 +143,9 @@ def read_objects_by_external_ids(self, ids: List[int]) -> OrderedDict:
142143
return {"object": objects, "external_id": external_ids}
143144

144145

145-
def evaluate_query(index_uri, query_kwargs, dim_id, vector_dim_offset):
146+
def evaluate_query(index_uri, query_kwargs, dim_id, vector_dim_offset, config=None):
146147
v_id = dim_id - vector_dim_offset
147-
index = object_index.ObjectIndex(uri=index_uri)
148+
index = object_index.ObjectIndex(uri=index_uri, config=config)
148149
distances, objects, metadata = index.query(
149150
{"object": np.array([[dim_id, dim_id, dim_id, dim_id]])}, k=5, **query_kwargs
150151
)
@@ -188,7 +189,9 @@ def df_filter(row):
188189
object_ids, np.array([v_id, v_id + 1, v_id + 2, v_id + 3, v_id + 4])
189190
)
190191

191-
index = object_index.ObjectIndex(uri=index_uri, load_metadata_in_memory=False)
192+
index = object_index.ObjectIndex(
193+
uri=index_uri, load_metadata_in_memory=False, config=config
194+
)
192195
distances, objects, metadata = index.query(
193196
{"object": np.array([[dim_id, dim_id, dim_id, dim_id]])}, k=5, **query_kwargs
194197
)
@@ -296,6 +299,135 @@ def test_object_index_ivf_flat(tmp_path):
296299
)
297300

298301

302+
def test_object_index_ivf_flat_cloud(tmp_path):
303+
from common import create_cloud_uri
304+
from common import delete_uri
305+
from common import setUpCloudToken
306+
307+
setUpCloudToken()
308+
config = tiledb.cloud.Config().dict()
309+
index_uri = create_cloud_uri("object_index_ivf_flat")
310+
worker_resources = {"cpu": "1", "memory": "2Gi"}
311+
reader = TestReader(
312+
object_id_start=0,
313+
object_id_end=1000,
314+
vector_dim_offset=0,
315+
)
316+
embedding = TestEmbedding()
317+
318+
index = object_index.create(
319+
uri=index_uri,
320+
index_type="IVF_FLAT",
321+
object_reader=reader,
322+
embedding=embedding,
323+
config=config,
324+
)
325+
326+
# Check initial ingestion
327+
index.update_index(
328+
embeddings_generation_driver_mode=Mode.BATCH,
329+
embeddings_generation_mode=Mode.BATCH,
330+
vector_indexing_mode=Mode.BATCH,
331+
workers=2,
332+
worker_resources=worker_resources,
333+
driver_resources=worker_resources,
334+
kmeans_resources=worker_resources,
335+
ingest_resources=worker_resources,
336+
consolidate_partition_resources=worker_resources,
337+
objects_per_partition=500,
338+
partitions=10,
339+
config=config,
340+
)
341+
evaluate_query(
342+
index_uri=index_uri,
343+
query_kwargs={"nprobe": 10},
344+
dim_id=42,
345+
vector_dim_offset=0,
346+
config=config,
347+
)
348+
# Check that updating the same data doesn't create duplicates
349+
index.update_index(
350+
embeddings_generation_driver_mode=Mode.BATCH,
351+
embeddings_generation_mode=Mode.BATCH,
352+
vector_indexing_mode=Mode.BATCH,
353+
workers=2,
354+
worker_resources=worker_resources,
355+
driver_resources=worker_resources,
356+
kmeans_resources=worker_resources,
357+
ingest_resources=worker_resources,
358+
consolidate_partition_resources=worker_resources,
359+
objects_per_partition=500,
360+
partitions=10,
361+
config=config,
362+
)
363+
evaluate_query(
364+
index_uri=index_uri,
365+
query_kwargs={"nprobe": 10},
366+
dim_id=42,
367+
vector_dim_offset=0,
368+
config=config,
369+
)
370+
371+
# Add new data with a new reader
372+
reader = TestReader(
373+
object_id_start=1000,
374+
object_id_end=2000,
375+
vector_dim_offset=0,
376+
)
377+
index.update_object_reader(reader, config=config)
378+
index.update_index(
379+
embeddings_generation_driver_mode=Mode.BATCH,
380+
embeddings_generation_mode=Mode.BATCH,
381+
vector_indexing_mode=Mode.BATCH,
382+
workers=2,
383+
worker_resources=worker_resources,
384+
driver_resources=worker_resources,
385+
kmeans_resources=worker_resources,
386+
ingest_resources=worker_resources,
387+
consolidate_partition_resources=worker_resources,
388+
objects_per_partition=500,
389+
partitions=10,
390+
config=config,
391+
)
392+
evaluate_query(
393+
index_uri=index_uri,
394+
query_kwargs={"nprobe": 10},
395+
dim_id=1042,
396+
vector_dim_offset=0,
397+
config=config,
398+
)
399+
400+
# Check overwriting existing data
401+
reader = TestReader(
402+
object_id_start=1000,
403+
object_id_end=2000,
404+
vector_dim_offset=1000,
405+
)
406+
index.update_object_reader(reader, config=config)
407+
index.update_index(
408+
embeddings_generation_driver_mode=Mode.BATCH,
409+
embeddings_generation_mode=Mode.BATCH,
410+
vector_indexing_mode=Mode.BATCH,
411+
workers=2,
412+
worker_resources=worker_resources,
413+
driver_resources=worker_resources,
414+
kmeans_resources=worker_resources,
415+
ingest_resources=worker_resources,
416+
consolidate_partition_resources=worker_resources,
417+
objects_per_partition=500,
418+
partitions=10,
419+
config=config,
420+
)
421+
evaluate_query(
422+
index_uri=index_uri,
423+
query_kwargs={"nprobe": 10},
424+
dim_id=2042,
425+
vector_dim_offset=1000,
426+
config=config,
427+
)
428+
delete_uri(index_uri, config)
429+
430+
299431
def test_object_index_flat(tmp_path):
300432
reader = TestReader(
301433
object_id_start=0,

0 commit comments

Comments
 (0)