Merge pull request #453 from aperture-data/release-0.4.30

gsaluja9 · web-flow · commit 683258615af1 · 2024-07-14T13:32:00.000-04:00
Release 0.4.30
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -10,9 +10,9 @@ ci:
     skip: []
     submodules: false
 repos:
-    - repo: https://github.com/pre-commit/mirrors-autopep8
-      rev: 898691a
+    - repo: https://github.com/hhatto/autopep8
+      rev: 8b75604
       hooks:
       - id: autopep8
         exclude: _pb2.py$
-        args: ["--ignore", "E251,E241,E221", "-i"]
+        args: ["--ignore", "E251,E241,E221,E402,E265,E275", "-i"]
diff --git a/aperturedb/Descriptors.py b/aperturedb/Descriptors.py
@@ -18,14 +18,16 @@ class Descriptors(Entities):
     def __init__(self, db):
         super().__init__(db)
 
-    def find_similar(self,
-                     set: str,
-                     vector,
-                     k_neighbors: int,
-                     constraints=None,
-                     distances: bool = False,
-                     blobs: bool = False,
-                     results={"all_properties": True}):
+    def find_similar(
+        self,
+        set: str,
+        vector,
+        k_neighbors: int,
+        constraints=None,
+        distances: bool = False,
+        blobs: bool = False,
+        results={"all_properties": True},
+    ):
         """
         Find similar descriptor sets to the input descriptor set.
 
@@ -42,13 +44,15 @@ def find_similar(self,
             results: Response from the server.
         """
 
-        command = {"FindDescriptor": {
-            "set": set,
-            "distances": distances,
-            "blobs": blobs,
-            "results": results,
-            "k_neighbors": k_neighbors,
-        }}
+        command = {
+            "FindDescriptor": {
+                "set": set,
+                "distances": distances,
+                "blobs": blobs,
+                "results": results,
+                "k_neighbors": k_neighbors,
+            }
+        }
 
         if constraints is not None:
             command["FindDescriptor"]["constraints"] = constraints.constraints
@@ -57,7 +61,7 @@ def find_similar(self,
         blobs_in = [np.array(vector, dtype=np.float32).tobytes()]
         _, response, blobs_out = execute_batch(query, blobs_in, self.db)
 
-        self.response = response[0]["FindDescriptor"]["entities"]
+        self.response = response[0]["FindDescriptor"].get("entities", [])
 
         if blobs:
             for i, entity in enumerate(self.response):
@@ -71,7 +75,7 @@ def _descriptorset_metric(self, set: str):
         response, _ = self.db.query(query)
         logger.debug(response)
         assert self.db.last_query_ok(), response
-        return response[0]["FindDescriptorSet"]['entities'][0]["_metrics"][0]
+        return response[0]["FindDescriptorSet"]["entities"][0]["_metrics"][0]
 
     def _vector_similarity(self, v1, v2):
         """Find similarity between two vectors using the metric of the descriptor set."""
@@ -85,13 +89,15 @@ def _vector_similarity(self, v1, v2):
         else:
             raise ValueError("Unknown metric: %s" % self.metric)
 
-    def find_similar_mmr(self,
-                         set: str,
-                         vector,
-                         k_neighbors: int,
-                         fetch_k: int,
-                         lambda_mult: float = 0.5,
-                         **kwargs):
+    def find_similar_mmr(
+        self,
+        set: str,
+        vector,
+        k_neighbors: int,
+        fetch_k: int,
+        lambda_mult: float = 0.5,
+        **kwargs,
+    ):
         """
         As find_similar, but using the MMR algorithm to diversify the results.
 
@@ -132,13 +138,18 @@ def find_similar_mmr(self,
                 unselected.remove(0)
             else:
                 selected_unselected_similarity = np.array(
-                    [[document_similarity[(i, j)] for j in unselected] for i in selected])
+                    [
+                        [document_similarity[(i, j)] for j in unselected]
+                        for i in selected
+                    ]
+                )
                 worst_similarity = np.max(
                     selected_unselected_similarity, axis=0)
                 relevance_scores = np.array(
                     [query_similarity[i] for i in unselected])
-                scores = (1 - lambda_mult) * worst_similarity + \
-                    lambda_mult * relevance_scores
+                scores = (
+                    1 - lambda_mult
+                ) * worst_similarity + lambda_mult * relevance_scores
                 max_index = unselected[np.argmax(scores)]
                 selected.append(max_index)
                 unselected.remove(max_index)
diff --git a/aperturedb/ParallelQuery.py b/aperturedb/ParallelQuery.py
@@ -5,6 +5,7 @@
 import json
 import logging
 import math
+import inspect
 
 
 from aperturedb.DaskManager import DaskManager
@@ -17,7 +18,7 @@
 def execute_batch(q: Commands, blobs: Blobs, db: Connector,
                   success_statuses: list[int] = [0],
                   response_handler: Optional[Callable] = None, commands_per_query: int = 1, blobs_per_query: int = 0,
-                  strict_response_validation: bool = False) -> Tuple[int, CommandResponses, Blobs]:
+                  strict_response_validation: bool = False, cmd_index=None) -> Tuple[int, CommandResponses, Blobs]:
     """
     Execute a batch of queries, doing useful logging around it.
     Calls the response handler if provided.
@@ -50,7 +51,8 @@ def execute_batch(q: Commands, blobs: Blobs, db: Connector,
         if response_handler is not None:
             try:
                 ParallelQuery.map_response_to_handler(response_handler,
-                                                      q, blobs, r, b, commands_per_query, blobs_per_query)
+                                                      q, blobs, r, b, commands_per_query, blobs_per_query,
+                                                      cmd_index)
             except BaseException as e:
                 logger.exception(e)
                 if strict_response_validation:
@@ -112,7 +114,7 @@ def getSuccessStatus(cls):
 
     @classmethod
     def map_response_to_handler(cls, handler, query, query_blobs,  response, response_blobs,
-                                commands_per_query, blobs_per_query):
+                                commands_per_query, blobs_per_query, cmd_index_offset):
         # We could potentially always call this handler function
         # and let the user deal with the error cases.
         blobs_returned = 0
@@ -140,7 +142,8 @@ def map_response_to_handler(cls, handler, query, query_blobs,  response, respons
                 response[start:end] if issubclass(
                     type(response), list) else response,
                 response_blobs[blobs_returned:blobs_returned + b_count] if
-                len(response_blobs) >= blobs_returned + b_count else None)
+                len(response_blobs) >= blobs_returned + b_count else None,
+                None if cmd_index_offset is None else cmd_index_offset + i)
             blobs_returned += b_count
 
     def __init__(self, db: Connector, dry_run: bool = False):
@@ -218,7 +221,7 @@ def call_response_handler(self, q: Commands, blobs: Blobs, r: CommandResponses,
         except BaseException as e:
             logger.exception(e)
 
-    def do_batch(self, db: Connector, data: List[Tuple[Commands, Blobs]]) -> None:
+    def do_batch(self, db: Connector, batch_start: int,  data: List[Tuple[Commands, Blobs]]) -> None:
         """
         Executes batch of queries and blobs in the database.
 
@@ -257,6 +260,19 @@ def process_responses(requests, input_blobs, responses, output_blobs):
                 response_handler = self.generator.response_handler
             if hasattr(self.generator, "strict_response_validation") and isinstance(self.generator.strict_response_validation, bool):
                 strict_response_validation = self.generator.strict_response_validation
+
+            # if response_handler doesn't support index, just discard the index with a wrapper.
+            if response_handler is not None:
+                parameter_count = len(inspect.signature(
+                    response_handler).parameters)
+                if parameter_count < 4 or parameter_count > 5:
+                    raise Exception("Bad Signature for response_handler :"
+                                    f"expected 6 > args > 3, got {parameter_count}")
+                if parameter_count == 4:
+                    indexless_handler = response_handler
+                    def response_handler(query, qblobs, resp, rblobs, qindex): return indexless_handler(
+                        query, qblobs, resp, rblobs)
+
             result, r, b = self.batch_command(
                 q,
                 blobs,
@@ -265,7 +281,8 @@ def process_responses(requests, input_blobs, responses, output_blobs):
                 response_handler,
                 self.commands_per_query,
                 self.blobs_per_query,
-                strict_response_validation=strict_response_validation)
+                strict_response_validation=strict_response_validation,
+                cmd_index=batch_start)
             if result == 0:
                 query_time = db.get_last_query_time()
                 worker_stats["succeeded_commands"] = len(q)
@@ -316,7 +333,8 @@ def worker(self, thid: int, generator, start: int, end: int):
             batch_end = min(batch_start + self.batchsize, end)
 
             try:
-                self.do_batch(db, generator[batch_start:batch_end])
+                self.do_batch(db, batch_start,
+                              generator[batch_start:batch_end])
             except Exception as e:
                 logger.exception(e)
                 logger.warning(
diff --git a/aperturedb/ParallelQuerySet.py b/aperturedb/ParallelQuerySet.py
@@ -49,7 +49,8 @@ def gen_execute_batch_sets(base_executor):
     #
     def execute_batch_sets(query_set, blob_set, db, success_statuses: list[int] = [0],
                            response_handler: Optional[Callable] = None, commands_per_query: list[int] = -1,
-                           blobs_per_query: list[int] = -1, strict_response_validation: bool = False):
+                           blobs_per_query: list[int] = -1,
+                           strict_response_validation: bool = False, cmd_index: int = None):
 
         logger.info("Execute Batch Sets = Batch Size {0}  Comands Per Query {1} Blobs Per Query {2}".format(
             len(query_set), commands_per_query, blobs_per_query))
@@ -297,7 +298,11 @@ def constraint_filter(single_line, single_results):
             if len(executable_queries) > 0:
                 result_code, db_results, db_blobs = base_executor(executable_queries, used_blobs,
                                                                   db, local_success_statuses,
-                                                                  None, commands_per_query[i], blobs_per_query[i], strict_response_validation=strict_response_validation)
+                                                                  None,
+                                                                  commands_per_query[i],
+                                                                  blobs_per_query[i],
+                                                                  strict_response_validation=strict_response_validation,
+                                                                  cmd_index=cmd_index)
                 if response_handler != None and db.last_query_ok():
                     def map_to_set(query, query_blobs, resp, resp_blobs):
                         response_handler(
@@ -365,7 +370,7 @@ def verify_generator(self, generator) -> bool:
         logger.error(type(generator[0]))
         return False
 
-    def do_batch(self, db: Connector, data: List[Tuple[Commands, Blobs]]) -> None:
+    def do_batch(self, db: Connector, batch_start: int,  data: List[Tuple[Commands, Blobs]]) -> None:
         """
         This is an override of ParallelQuery.do_batch.
 
@@ -387,7 +392,7 @@ def do_batch(self, db: Connector, data: List[Tuple[Commands, Blobs]]) -> None:
         self.batch_command = gen_execute_batch_sets(
             self.base_batch_command)
 
-        ParallelQuery.do_batch(self, db, data)
+        ParallelQuery.do_batch(self, db, batch_start, data)
 
     def print_stats(self) -> None:
 
diff --git a/aperturedb/Utils.py b/aperturedb/Utils.py
@@ -1,22 +1,32 @@
 """
 Miscellaneous utility functions for ApertureDB.
 """
+from aperturedb.Query import QueryBuilder
+from aperturedb.cli.configure import ls
+from aperturedb.Configuration import Configuration
+from aperturedb.ParallelQuery import execute_batch
+from aperturedb import ProgressBar
+from aperturedb.ConnectorRest import ConnectorRest
+from aperturedb.Connector import Connector
 import logging
 import json
 import os
 import importlib
 import sys
 from typing import List, Optional, Dict
 
-from graphviz import Source, Digraph
+HAS_GRAPHVIZ = True  # set to False below when graphviz cannot be imported
+try:
+    from graphviz import Source, Digraph
+except ImportError:  # graphviz is optional; only a failed import is tolerated
+    HAS_GRAPHVIZ = False
+
+    class Source:  # placeholder so annotations such as "-> Source" still resolve
+        pass
+
+    class Digraph:  # placeholder so the module-level name always exists
+        pass
 
-from aperturedb.Connector import Connector
-from aperturedb.ConnectorRest import ConnectorRest
-from aperturedb import ProgressBar
-from aperturedb.ParallelQuery import execute_batch
-from aperturedb.Configuration import Configuration
-from aperturedb.cli.configure import ls
-from aperturedb.Query import QueryBuilder
 
 logger = logging.getLogger(__name__)
 
@@ -213,10 +223,24 @@ def visualize_schema(self, filename: str = None, format: str = "png") -> Source:
         Returns:
             source: The visualization of the schema.
         """
+        if not HAS_GRAPHVIZ:
+            raise Exception("graphviz not installed.")
         r = self.get_schema()
 
+        colors = dict(
+            edge="#3A3B9C",
+            entity_background="#2A2E78",
+            entity_foreground="#E2E0F1",
+            property_background="#337EC0",
+            property_foreground="#E2E0F1",
+            connection_background="#5956F1",
+            connection_foreground="#E2E0F1",
+            connection_property_background="#33E1FF",
+            connection_property_foreground="#2A2E78"
+        )
+
         dot = Digraph(comment='ApertureDB Schema Diagram', node_attr={
-                      'shape': 'none', 'fontcolor': '#E2E0F1'}, graph_attr={'rankdir': 'LR'}, edge_attr={'color': '#3A3B9C'})
+                      'shape': 'none'}, graph_attr={'rankdir': 'LR'}, edge_attr={'color': colors['edge']})
 
         # Add entities as nodes and connections as edges
         entities = r['entities']['classes']
@@ -228,25 +252,25 @@ def visualize_schema(self, filename: str = None, format: str = "png") -> Source:
             properties = data["properties"]
             table = f'''<
             <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
-            <TR><TD BGCOLOR="#2A2E78" COLSPAN="3"><B>{entity}</B> ({matched:,})</TD></TR>
+            <TR><TD BGCOLOR="{colors["entity_background"]}" COLSPAN="3"><FONT COLOR="{colors["entity_foreground"]}"><B>{entity}</B> ({matched:,})</FONT></TD></TR>
             '''
             for prop, (matched, indexed, typ) in properties.items():
-                table += f'<TR><TD BGCOLOR="#337EC0"><B>{prop.strip()}</B></TD> <TD BGCOLOR="#337EC0">{matched:,}</TD> <TD BGCOLOR="#337EC0">{"Indexed" if indexed else "Unindexed"}, {typ}</TD></TR>'
+                table += f'<TR><TD BGCOLOR="{colors["property_background"]}"><FONT COLOR="{colors["property_foreground"]}"><B>{prop.strip()}</B></FONT></TD> <TD BGCOLOR="{colors["property_background"]}"><FONT COLOR="{colors["property_foreground"]}">{matched:,}</FONT></TD> <TD BGCOLOR="{colors["property_background"]}"><FONT COLOR="{colors["property_foreground"]}">{"Indexed" if indexed else "Unindexed"}, {typ}</FONT></TD></TR>'
             for connection, data in connections.items():
                 if data['src'] == entity:
                     matched = data["matched"]
                     # dictionary from name to (matched, indexed, type)
                     properties = data["properties"]
-                    table += f'<TR><TD BGCOLOR="#5956F1" COLSPAN="3" PORT="{connection}"><B>{connection}</B> ({matched:,})</TD></TR>'
+                    table += f'<TR><TD BGCOLOR="{colors["connection_background"]}" COLSPAN="3" PORT="{connection}"><FONT COLOR="{colors["connection_foreground"]}"><B>{connection}</B> ({matched:,})</FONT></TD></TR>'
                     if properties:
                         for prop, (matched, indexed, typ) in properties.items():
-                            table += f'<TR><TD BGCOLOR="#33E1FF"><B>{prop.strip()}</B></TD> <TD BGCOLOR="#33E1FF">{matched:,}</TD> <TD BGCOLOR="#33E1FF">{"Indexed" if indexed else "Unindexed"}, {typ}</TD></TR>'
+                            table += f'<TR><TD BGCOLOR="{colors["connection_property_background"]}"><FONT COLOR="{colors["connection_property_foreground"]}"><B>{prop.strip()}</B></FONT></TD> <TD BGCOLOR="{colors["connection_property_background"]}"><FONT COLOR="{colors["connection_property_foreground"]}">{matched:,}</FONT></TD> <TD BGCOLOR="{colors["connection_property_background"]}"><FONT COLOR="{colors["connection_property_foreground"]}">{"Indexed" if indexed else "Unindexed"}, {typ}</FONT></TD></TR>'
             table += '</TABLE>>'
             dot.node(entity, label=table)
 
         for connection, data in connections.items():
             dot.edge(f'{data["src"]}:{connection}',
-                     f'{data["dst"]}:{connection}')
+                     f'{data["dst"]}')
 
         # Render the diagram inline
         s = Source(dot.source, filename="schema_diagram.gv", format="png")
diff --git a/aperturedb/__init__.py b/aperturedb/__init__.py
diff --git a/test/test_ResponseHandler.py b/test/test_ResponseHandler.py