Commit 6e1e3a4

Merge pull request #443 from aperture-data/release-0.4.28
Release 0.4.28
2 parents: 69550db + aca1a86

File tree: 7 files changed (+231 −74 lines)


README.md

Lines changed: 10 additions & 10 deletions

@@ -1,14 +1,14 @@
 # ApertureDB Client Python Module
 
-This is the python sdk for building applications with [ApertureDB](https://docs.aperturedata.io/Introduction/WhatIsAperture).
+This is the Python SDK for building applications with [ApertureDB](https://docs.aperturedata.io/Introduction/WhatIsAperture).
 
-This comprises of utilities to get Data in and out of ApertureDB in an optimal manner.
-A quick [getting started guide](https://docs.aperturedata.io/HowToGuides/start/Setup) is useful to start building with this sdk.
+This comprises of utilities to get data in and out of ApertureDB in an optimal manner.
+A quick [getting started guide](https://docs.aperturedata.io/HowToGuides/start/Setup) is useful to start building with this SDK.
 For more concrete examples, please refer to:
 * [Simple examples and concepts](https://docs.aperturedata.io/category/simple-usage-examples)
 * [Advanced usage examples](https://docs.aperturedata.io/category/advanced-usage-examples)
 
-# Installing in a custom virtual enviroment.
+# Installing in a custom virtual enviroment
 ```bash
 pip install aperturedb[complete]
 ```
@@ -21,7 +21,7 @@ pip install aperturedb
 A complete [reference](https://docs.aperturedata.io/category/aperturedb-python-sdk) of this SDK is available on the offical [ApertureDB Documentation](https://docs.aperturedata.io)
 
 
-# Dvelopment setup.
+# Development setup
 The recommended way is to clone this repo, and do an editable install as follows:
 ```bash
 git clone https://github.com/aperture-data/aperturedb-python.git
@@ -30,8 +30,8 @@ pip install -e .[dev]
 ```
 
 
-# Running tests.
-The tests are inside the test dir.
+# Running tests
+The tests are inside the `test` dir.
 
 All the tests can be run with:
 
@@ -45,15 +45,15 @@ Running specefic tests can be accomplished by invoking it with pytest as follows
 cd test && docker compose up -d && PROJECT=aperturedata KAGGLE_username=ci KAGGLE_key=dummy coverage run -m python -m pytest test_Session.py -v --log-cli-level=DEBUG
 ```
 
-# Reporting bugs.
+# Reporting bugs
 Any error in the functionality / documentation / tests maybe reported by creating a
 [github issue](https://github.com/aperture-data/aperturedb-python/issues).
 
-# Development guidelines.
+# Development guidelines
 For inclusion of any features, a PR may be created with a patch,
 and a brief description of the problem and the fix.
 The CI enforces a coding style guideline with autopep8 and
 a script to detect trailing white spaces.
 
-In case a PR encounters failures, the log would describe the location of
+If a PR encounters failures, the log will describe the location of
 the offending line with a description of the problem.

aperturedb/ParallelQuery.py

Lines changed: 40 additions & 37 deletions

@@ -48,43 +48,13 @@ def execute_batch(q: Commands, blobs: Blobs, db: Connector,
 
     if db.last_query_ok():
         if response_handler is not None:
-            # We could potentially always call this handler function
-            # and let the user deal with the error cases.
-            blobs_returned = 0
-            for i in range(math.ceil(len(q) / commands_per_query)):
-                start = i * commands_per_query
-                end = start + commands_per_query
-                blobs_start = i * blobs_per_query
-                blobs_end = blobs_start + blobs_per_query
-
-                b_count = 0
-                if issubclass(type(r), list):
-                    for req, resp in zip(q[start:end], r[start:end]):
-                        for k in req:
-                            # Ref to https://docs.aperturedata.io/query_language/Reference/shared_command_parameters/blobs
-                            blobs_where_default_true = \
-                                k in ["FindImage", "FindBlob", "FindVideo"] and (
-                                    "blobs" not in req[k] or req[k]["blobs"])
-                            blobs_where_default_false = \
-                                k in [
-                                    "FindDescriptor", "FindBoundingBox"] and "blobs" in req[k] and req[k]["blobs"]
-                            if blobs_where_default_true or blobs_where_default_false:
-                                count = resp[k]["returned"]
-                                b_count += count
-
-                try:
-                    # The returned blobs need to be sliced to match the
-                    # returned entities per command in query.
-                    response_handler(
-                        q[start:end],
-                        blobs[blobs_start:blobs_end],
-                        r[start:end] if issubclass(type(r), list) else r,
-                        b[blobs_returned:blobs_returned + b_count] if len(b) >= blobs_returned + b_count else None)
-                except BaseException as e:
-                    logger.exception(e)
-                    if strict_response_validation:
-                        raise e
-                blobs_returned += b_count
+            try:
+                ParallelQuery.map_response_to_handler(response_handler,
+                                                      q, blobs, r, b, commands_per_query, blobs_per_query)
+            except BaseException as e:
+                logger.exception(e)
+                if strict_response_validation:
+                    raise e
     else:
         # Transaction failed entirely.
         logger.error(f"Failed query = {q} with response = {r}")
@@ -140,6 +110,39 @@ def setSuccessStatus(cls, statuses: list[int]):
     def getSuccessStatus(cls):
         return cls.success_statuses
 
+    @classmethod
+    def map_response_to_handler(cls, handler, query, query_blobs, response, response_blobs,
+                                commands_per_query, blobs_per_query):
+        # We could potentially always call this handler function
+        # and let the user deal with the error cases.
+        blobs_returned = 0
+        for i in range(math.ceil(len(query) / commands_per_query)):
+            start = i * commands_per_query
+            end = start + commands_per_query
+            blobs_start = i * blobs_per_query
+            blobs_end = blobs_start + blobs_per_query
+
+            b_count = 0
+            if issubclass(type(response), list):
+                for req, resp in zip(query[start:end], response[start:end]):
+                    for k in req:
+                        blob_returning_commands = ["FindImage", "FindBlob", "FindVideo",
+                                                   "FindDescriptor", "FindBoundingBox"]
+                        if k in blob_returning_commands and "blobs" in req[k] and req[k]["blobs"]:
+                            count = resp[k]["returned"]
+                            b_count += count
+
+            # The returned blobs need to be sliced to match the
+            # returned entities per command in query.
+            handler(
+                query[start:end],
+                query_blobs[blobs_start:blobs_end],
+                response[start:end] if issubclass(
+                    type(response), list) else response,
+                response_blobs[blobs_returned:blobs_returned + b_count] if
+                len(response_blobs) >= blobs_returned + b_count else None)
+            blobs_returned += b_count
+
     def __init__(self, db: Connector, dry_run: bool = False):
 
         super().__init__()
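The new `map_response_to_handler` classmethod walks the flat command list in `commands_per_query`-sized windows and hands each window, together with its blob slice, to the handler. The windowing arithmetic can be sketched in isolation; `batch_slices` below is a hypothetical helper written only to illustrate the slicing, not part of the SDK:

```python
import math

def batch_slices(total_commands, commands_per_query, blobs_per_query):
    # Mirror of the slicing arithmetic in map_response_to_handler: each
    # handler call receives one query's worth of commands and input blobs.
    slices = []
    for i in range(math.ceil(total_commands / commands_per_query)):
        start = i * commands_per_query
        end = start + commands_per_query
        blobs_start = i * blobs_per_query
        blobs_end = blobs_start + blobs_per_query
        slices.append(((start, end), (blobs_start, blobs_end)))
    return slices

# 6 commands issued as 2-command queries, each carrying 1 blob:
print(batch_slices(6, 2, 1))
# [((0, 2), (0, 1)), ((2, 4), (1, 2)), ((4, 6), (2, 3))]
```

Returned blobs are handled separately in the real method: they are counted per command via the `"returned"` field and consumed cumulatively through `blobs_returned`.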

aperturedb/ParallelQuerySet.py

Lines changed: 36 additions & 18 deletions

@@ -2,6 +2,7 @@
 from typing import Any, Callable, List, Tuple
 import itertools
 import logging
+import math
 
 import numpy as np
 
@@ -28,7 +29,7 @@ def remove_blobs(item: Any) -> Any:
     return item
 
 
-def gen_execute_batch_sets(base_executor, per_batch_response_handler: Callable = None):
+def gen_execute_batch_sets(base_executor):
 
     #
     # execute_batch_sets - executes multiple sets of queries with optional constraints on follow on sets
@@ -47,7 +48,7 @@ def gen_execute_batch_sets(base_executor, per_batch_response_handler: Callable =
     # execution
     #
     def execute_batch_sets(query_set, blob_set, db, success_statuses: list[int] = [0],
-                           response_handler: Callable = None, commands_per_query: list[int] = -1,
+                           response_handler: Optional[Callable] = None, commands_per_query: list[int] = -1,
                            blobs_per_query: list[int] = -1, strict_response_validation: bool = False):
 
         logger.info("Execute Batch Sets = Batch Size {0} Comands Per Query {1} Blobs Per Query {2}".format(
@@ -69,13 +70,21 @@ def execute_batch_sets(query_set, blob_set, db, success_statuses: list[int] = [0
         # verify layout if a complex set
         if per_set_blobs:
             first_element_blobs = blob_set[0]
+
+            if len(first_element_blobs) == 0 or len(first_element_blobs) != set_total:
+                # user has confused blob format for sure.
+                logger.error("Malformed blobs for first element. Blob return from your loader "
+                             "should be [query_blobs] where query_blobs = [ first_cmd_list, second_cmd_list, ... ] ")
+                raise Exception(
+                    "Malformed blobs input. Expected First element to have a list of blobs for each set.")
+
             first_query_blobs = first_element_blobs[0]
             # If someone is looking for info logging from PQS, it is likely that blobs are not being set properly.
             # The wrapping of blobs in general can be confusing. Best suggestion is looking at a loader.
             logger.info("Blobs for first set = " +
-                        str(remove_blobs(blob_set[0])))
+                        str(remove_blobs(first_element_blobs)))
             logger.info("First Blob for first set = " +
-                        str(remove_blobs(blob_set[0][0])))
+                        str(remove_blobs(first_query_blobs)))
             if not isinstance(first_query_blobs, list):
                 logger.error(
                     "Expected a list of lists for the first element's blob sets")
@@ -111,7 +120,7 @@ def set_blob_filter(all_blobs, strike_list, set_nm):
             # the list comprehension pulls out the blob set for the requested set
             # the blob set is then flattened as the query expects a flat array using blobs_per_query as the iterator
             # the flat list is them zipped with the strike list, which determines which blobs are unused
-            # the filter checks if the blob is to be struc
+            # the filter checks if the blob is to be struck
             # the map pulls the remaining blobs out
 
             return list(map(lambda pair: pair[0],
@@ -155,9 +164,9 @@ def first_only_blobs(all_blobs, strike_list, set_nm):
 
         # allowed layouts for commands other than the seed command
         # { "cmd" : {} } -> standard single command
-        # [{ "cmd1": {}, "cmd2} : {}] -> standard multiple command
-        # [{ "constraint" : {} , { "cmd" : {} }] -> constraint with a single command
-        # [{ "constraints: {} , [{"cmd1" : {} }, {"cmd2": {} }]] -> constraint with multiple command
+        # [{ "cmd1": {} },{ "cmd2" : {} }] -> standard multiple command
+        # [{ constraints } , { "cmd" : {} }] -> constraint with a single command
+        # [{ constraints } , [{"cmd1" : {} }, {"cmd2": {} }]] -> constraint with multiple command
 
         known_constraint_keys = ["results", "apply"]
         constraints = None
@@ -202,6 +211,10 @@ def constraint_filter(single_line, single_results):
             passed_all_constraints = True
             for result_number in result_constraints:
 
+                if not isinstance(result_number, int):
+                    raise Exception("Keys for result constraints must be numbers: "
+                                    f"{result_number} is {type(result_number)}")
+
                 if len(single_results) < result_number or single_results[result_number] is None:
                     # in theory here we have two possibilities: a user can have a correctly formed constraint which didn't execute by design
                     # ( which is what process here )
@@ -278,17 +291,24 @@ def constraint_filter(single_line, single_results):
             blob_strike_list = list(map(lambda q: q is None, queries))
 
             # filter out struck blobs
-            used_blobs = filter(lambda b: b is not None,
-                                blob_filter(blob_set, blob_strike_list, i))
+            used_blobs = list(filter(lambda b: b is not None,
+                                     blob_filter(blob_set, blob_strike_list, i)))
 
-            # TODO: add wrapped response_handler.
-            if response_handler != None:
-                logger.warning(
-                    "ParallelQuerySet does not yet support a response_handler which will identify which set is being worked on")
             if len(executable_queries) > 0:
                 result_code, db_results, db_blobs = base_executor(executable_queries, used_blobs,
                                                                   db, local_success_statuses,
                                                                   None, commands_per_query[i], blobs_per_query[i], strict_response_validation=strict_response_validation)
+                if response_handler != None and db.last_query_ok():
+                    def map_to_set(query, query_blobs, resp, resp_blobs):
+                        response_handler(
+                            i, query, query_blobs, resp, resp_blobs)
+                    try:
+                        ParallelQuery.map_response_to_handler(map_to_set,
+                                                              executable_queries, used_blobs, db_results, db_blobs, commands_per_query[i], blobs_per_query[i])
+                    except BaseException as e:
+                        logger.exception(e)
+                        if strict_response_validation:
+                            raise e
             else:
                 logger.info(
                     f"Skipped executing set {i}, no executable queries")
@@ -364,10 +384,8 @@ def do_batch(self, db: Connector, data: List[Tuple[Commands, Blobs]]) -> None:
         self.commands_per_query = self.generator.commands_per_query
         self.blobs_per_query = self.generator.blobs_per_query
         set_response_handler = None
-        if hasattr(self.generator, "set_response_handler") and callable(self.generator.set_response_handler):
-            set_response_handler = self.generator.set_response_handler
         self.batch_command = gen_execute_batch_sets(
-            self.base_batch_command, set_response_handler)
+            self.base_batch_command)
 
         ParallelQuery.do_batch(self, db, data)
 
@@ -388,7 +406,7 @@ def print_stats(self) -> None:
         else:
             mean = np.mean(times)
             std = np.std(times)
-            tp = 1 / mean * self.numthreads
+            tp = 0 if mean == 0 else 1 / mean * self.numthreads
 
             print(f"Avg Query time (s): {mean}")
             print(f"Query time std: {std}")
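The blob filtering this file builds on pairs each flat blob with a "strike" flag (a query that was struck out by a constraint) and keeps only the unstruck blobs. The zip/filter/map chain can be sketched standalone; `drop_struck_blobs` is an illustrative name, not the SDK's:

```python
def drop_struck_blobs(flat_blobs, strike_list):
    # Pair each blob with its strike flag, keep only unstruck pairs,
    # then project the blobs back out -- the same zip/filter/map shape
    # used by set_blob_filter in the diff above.
    return list(map(lambda pair: pair[0],
                    filter(lambda pair: not pair[1],
                           zip(flat_blobs, strike_list))))

print(drop_struck_blobs(["b0", "b1", "b2", "b3"], [False, True, True, False]))
# ['b0', 'b3']
```

Wrapping the result in `list(...)` mirrors the commit's `used_blobs` fix: a bare `filter` object is a one-shot iterator, so materializing it lets it be consumed both by the executor and by the new per-set response handler.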

aperturedb/Utils.py

Lines changed: 7 additions & 2 deletions

@@ -6,7 +6,7 @@
 import os
 import importlib
 import sys
-from typing import List
+from typing import List, Optional, Dict
 
 from graphviz import Source, Digraph
 
@@ -522,7 +522,8 @@ def count_connections(self, connections_class, constraints=None) -> int:
 
         return total_connections
 
-    def add_descriptorset(self, name: str, dim: int, metric="L2", engine="FaissFlat") -> bool:
+    def add_descriptorset(self, name: str, dim: int, metric="L2", engine="FaissFlat",
+                          properties: Optional[Dict] = None) -> bool:
         """
         Add a descriptor set to the database.
 
@@ -531,6 +532,7 @@ def add_descriptorset(self, name: str, dim: int, metric="L2", engine="FaissFlat"
             dim (int): The dimension of the descriptors.
             metric (str, optional): The metric to use for the descriptors.
            engine (str, optional): The engine to use for the descriptors.
+            properties (dict, optional): The properties of the descriptor set.
 
         Returns:
             success (bool): True if the operation was successful, False otherwise.
@@ -544,6 +546,9 @@ def add_descriptorset(self, name: str, dim: int, metric="L2", engine="FaissFlat"
             }
         }]
 
+        if properties is not None:
+            query[0]["AddDescriptorSet"]["properties"] = properties
+
        try:
             self.execute(query)
         except:
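The new `properties` argument is spliced into the `AddDescriptorSet` command only when the caller supplies it. The query-building step can be sketched on its own; the diff does not show the full command body, so the field names other than `properties` below are assumptions based on the method signature, not confirmed by this patch:

```python
def build_add_descriptorset_query(name, dim, metric="L2", engine="FaissFlat",
                                  properties=None):
    # Sketch of the command construction in Utils.add_descriptorset: the
    # optional properties dict is attached only when the caller provides it,
    # matching the "if properties is not None" guard added by this commit.
    query = [{
        "AddDescriptorSet": {
            "name": name,
            "dimensions": dim,   # assumed field name
            "metric": metric,
            "engine": engine,
        }
    }]
    if properties is not None:
        query[0]["AddDescriptorSet"]["properties"] = properties
    return query
```

Omitting the key entirely (rather than sending `"properties": None`) keeps the command valid for callers that do not use the new argument.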

aperturedb/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
 
 logger = logging.getLogger(__name__)
 
-__version__ = "0.4.27"
+__version__ = "0.4.28"
 
 # set log level
 logger.setLevel(logging.DEBUG)

docker/notebook/Dockerfile

Lines changed: 13 additions & 1 deletion

@@ -13,7 +13,19 @@ RUN chmod 755 /start.sh
 # ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /usr/bin/tini
 # RUN chmod +x /usr/bin/tini
 # ENTRYPOINT ["/usr/bin/tini", "--"]
-RUN cd /aperturedata && pip install -e ".[notebook]"
+RUN cd /aperturedata && pip install -e ".[dev]"
+RUN echo "adb --install-completion" | bash
+
+# Install useful JupyterLab extensions
+RUN pip install jupyter-resource-usage
+
+# Suppress the annoying announcements popup
+RUN jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
+
+# Install CLIP (for running transformers)
+RUN pip install git+https://github.com/openai/CLIP.git
+
+RUN apt update && apt install -y curl && apt clean
 
 EXPOSE 8888
 CMD ["/start.sh"]
