Commit 195cec0

Merge pull request #4671 from opsmill/lgu-refactor-query-analyzer
Refactor QueryAnalyzer to GraphProfileGenerator for query benchmarks
2 parents 7474c8c + 501cdf2 commit 195cec0

File tree: 4 files changed, +95 -98 lines

backend/tests/helpers/query_benchmark/data_generator.py

Lines changed: 11 additions & 8 deletions

@@ -1,13 +1,13 @@
 from abc import abstractmethod
 from pathlib import Path
-from typing import Callable
+from typing import Callable, Optional

 from rich.console import Console
 from rich.progress import Progress

 from tests.helpers.query_benchmark.db_query_profiler import (
+    GraphProfileGenerator,
     InfrahubDatabaseProfiler,
-    ProfilerEnabler,
 )

@@ -36,7 +36,8 @@ async def load_data_and_profile(
     profile_frequency: int,
     graphs_output_location: Path,
     test_label: str,
-    memory_profiling_rate: int = 25,
+    graph_generator: GraphProfileGenerator,
+    memory_profiling_rate: Optional[int] = None,
 ) -> None:
     """
     Loads data using the provided data generator, profiles the execution at specified loading intervals,
@@ -59,7 +60,7 @@ async def load_data_and_profile(
     q, r = divmod(nb_elements, profile_frequency)
     nb_elem_per_batch = [profile_frequency] * q + ([r] if r else [])

-    query_analyzer = data_generator.db.query_analyzer
+    db_profiling_queries = data_generator.db

     with Progress(console=Console(force_terminal=True)) as progress:  # Need force_terminal to display with pytest
         task = progress.add_task(
@@ -68,12 +69,14 @@

         for i, nb_elem_to_load in enumerate(nb_elem_per_batch):
             await data_generator.load_data(nb_elements=nb_elem_to_load)
-            query_analyzer.increase_nb_elements_loaded(profile_frequency)
+            db_profiling_queries.increase_nb_elements_loaded(nb_elem_to_load)
             profile_memory = i % memory_profiling_rate == 0 if memory_profiling_rate is not None else False
-            with ProfilerEnabler(profile_memory=profile_memory, query_analyzer=query_analyzer):
+            with db_profiling_queries.profile(profile_memory):
                 await func_call()
             progress.advance(task)

     # Remove first measurements as queries when there is no data seem always extreme
-    query_analyzer.measurements = [m for m in query_analyzer.measurements if m.nb_elements_loaded != 0]
-    query_analyzer.create_graphs(output_location=graphs_output_location, label=test_label)
+    measurements = [m for m in db_profiling_queries.measurements if m.nb_elements_loaded != 0]
+    graph_generator.create_graphs(
+        measurements=measurements, output_location=graphs_output_location, label=test_label
+    )
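The refactor splits what QueryAnalyzer used to combine: the database wrapper now accumulates measurements itself, while a separate GraphProfileGenerator renders the graphs, so callers pass a graph_generator explicitly. A minimal sketch of the new call, assuming the parameter names data_generator and func_call inferred from the helper's body, with car_generator and run_query_under_test as hypothetical stand-ins:

from pathlib import Path

from tests.helpers.query_benchmark.data_generator import load_data_and_profile
from tests.helpers.query_benchmark.db_query_profiler import GraphProfileGenerator


async def run_benchmark(car_generator, run_query_under_test) -> None:
    # car_generator: a data generator backed by an InfrahubDatabaseProfiler
    # run_query_under_test: the coroutine being benchmarked
    await load_data_and_profile(
        data_generator=car_generator,
        func_call=run_query_under_test,
        nb_elements=1_000,
        profile_frequency=100,
        graphs_output_location=Path("query_performance_results"),
        test_label="example-run",
        graph_generator=GraphProfileGenerator(),  # new required argument
        memory_profiling_rate=25,  # now optional; None disables memory profiling
    )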

backend/tests/helpers/query_benchmark/db_query_profiler.py

Lines changed: 58 additions & 83 deletions

@@ -2,7 +2,7 @@
 from dataclasses import dataclass
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Optional, Type
+from typing import Any, List, Optional, Self, Type

 import matplotlib.pyplot as plt
 import pandas as pd
@@ -38,53 +38,27 @@ class QueryMeasurement:
     memory: Optional[float] = None


-class QueryAnalyzer:
-    name: Optional[str]
-    measurements: list[QueryMeasurement]
-    output_location: Path
-    nb_elements_loaded: int
-    profile_memory: bool
-    profile_duration: bool
-
-    def __init__(self) -> None:
-        self.reset()
-
-    def reset(self) -> None:
-        self.name = None
-        self.measurements = []
-        self.output_location = Path.cwd()
-        self.nb_elements_loaded = 0
-        self.profile_duration = False
-        self.profile_memory = False
-
-    def increase_nb_elements_loaded(self, increment: int) -> None:
-        self.nb_elements_loaded += increment
-
-    def get_df(self) -> pd.DataFrame:
+class GraphProfileGenerator:
+    def build_df_from_measuremenst(self, measurements: list[QueryMeasurement]) -> pd.DataFrame:
         data = {}
         for item in QueryMeasurement.__dataclass_fields__.keys():
-            data[item] = [getattr(m, item) for m in self.measurements]
+            data[item] = [getattr(m, item) for m in measurements]

         return pd.DataFrame(data)

-    def add_measurement(self, measurement: QueryMeasurement) -> None:
-        measurement.nb_elements_loaded = self.nb_elements_loaded
-        self.measurements.append(measurement)
-
-    def create_graphs(self, output_location: Path, label: str) -> None:
-        df = self.get_df()
+    def create_graphs(self, measurements: List[QueryMeasurement], output_location: Path, label: str) -> None:
+        df = self.build_df_from_measuremenst(measurements)
         query_names = set(df["query_name"].tolist())

         if not output_location.exists():
             output_location.mkdir(parents=True)

         for query_name in query_names:
-            self.create_duration_graph(query_name=query_name, label=label, output_dir=output_location)
+            self.create_duration_graph(df=df, query_name=query_name, label=label, output_dir=output_location)
             # self.create_memory_graph(query_name=query_name, label=label, output_dir=output_location)

-    def create_duration_graph(self, query_name: str, label: str, output_dir: Path) -> None:
+    def create_duration_graph(self, df: pd.DataFrame, query_name: str, label: str, output_dir: Path) -> None:
         metric = "duration"
-        df = self.get_df()

         name = f"{query_name}_{metric}"
         plt.figure(name)
@@ -105,71 +79,45 @@ def create_duration_graph(self, query_name: str, label: str, output_dir: Path) -> None:
         file_name = f"{name}.png"
         plt.savefig(str(output_dir / file_name), bbox_inches="tight")

-    def create_memory_graph(self, query_name: str, label: str, output_dir: Path) -> None:
-        metric = "memory"
-        df = self.get_df()
-        df_query = df[(df["query_name"] == query_name) & (~df["memory"].isna())]
-
-        plt.figure(query_name)
-
-        x = df_query["nb_elements_loaded"].values
-        y = df_query[metric].values
-
-        plt.plot(x, y, label=label)
-
-        plt.legend()
-
-        plt.ylabel("memory", fontsize=15)
-        plt.title(f"Query - {query_name} | {metric}", fontsize=20)
-
-        file_name = f"{query_name}_{metric}.png"

-        plt.savefig(str(output_dir / file_name))
-
-
-class ProfilerEnabler:
+class InfrahubDatabaseProfiler(InfrahubDatabase):
+    profiling_enabled: bool
     profile_memory: bool
+    measurements: List[QueryMeasurement]
+    nb_elements_loaded: int

-    def __init__(self, profile_memory: bool, query_analyzer: QueryAnalyzer) -> None:
-        self.profile_memory = profile_memory
-        self.query_analyzer = query_analyzer
-
-    def __enter__(self) -> None:
-        self.query_analyzer.profile_duration = True
-        self.query_analyzer.profile_memory = self.profile_memory
-
-    def __exit__(
-        self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
-    ) -> None:
-        self.query_analyzer.profile_duration = False
-        self.query_analyzer.profile_memory = False
-
-
-# Tricky to have it as an attribute of InfrahubDatabaseProfiler as some copies of InfrahubDatabase are made
-# during start_session calls.
-# query_analyzer = QueryAnalyzer()
-
-
-class InfrahubDatabaseProfiler(InfrahubDatabase):
-    def __init__(self, **kwargs: Any) -> None:
+    def __init__(
+        self,
+        profiling_enabled: bool = False,
+        profile_memory: bool = False,
+        measurements: Optional[List[QueryMeasurement]] = None,
+        nb_elements_loaded: int = 0,
+        **kwargs: Any,
+    ) -> None:  # todo args in constructor only because of __class__ pattern
         super().__init__(**kwargs)
-        self.query_analyzer = QueryAnalyzer()
+        self.profiling_enabled = profiling_enabled
+        self.profile_memory = profile_memory
+        self.measurements = measurements if measurements is not None else []
+        self.nb_elements_loaded = nb_elements_loaded
         # Note that any attribute added here should be added to get_context method.

     def get_context(self) -> dict[str, Any]:
         ctx = super().get_context()
-        ctx["query_analyzer"] = self.query_analyzer
+        ctx["profiling_enabled"] = self.profiling_enabled
+        ctx["profile_memory"] = self.profile_memory
+        ctx["measurements"] = self.measurements
+        ctx["nb_elements_loaded"] = self.nb_elements_loaded
         return ctx

     async def execute_query_with_metadata(
         self, query: str, params: dict[str, Any] | None = None, name: str | None = "undefined"
     ) -> tuple[list[Record], dict[str, Any]]:
-        if not self.query_analyzer.profile_duration:
+        if not self.profiling_enabled:
             # Profiling might be disabled to avoid capturing queries while loading data
             return await super().execute_query_with_metadata(query, params, name)

         # We don't want to memory profile all queries
-        if self.query_analyzer.profile_memory and name in self.queries_names_to_config:
+        if self.profile_memory and name in self.queries_names_to_config:
             # Following call to super().execute_query_with_metadata() will use this value to set PROFILE option
             self.queries_names_to_config[name].profile_memory = True
             profile_memory = True
@@ -190,7 +138,34 @@ async def execute_query_with_metadata(
             memory=metadata["profile"]["args"]["GlobalMemory"] if profile_memory else None,
             query_name=str(name),
             start_time=time_start,
+            nb_elements_loaded=self.nb_elements_loaded,
         )
-        self.query_analyzer.add_measurement(measurement)
+        self.measurements.append(measurement)

         return response, metadata
+
+    def profile(self, profile_memory: bool) -> Self:
+        """
+        This method allows to enable profiling of a InfrahubDatabaseProfiler instance
+        through a context manager with this syntax:
+
+        `with db.profile(profile_memory=...):
+            # run code to profile
+        `
+        """
+
+        self.profile_memory = profile_memory
+        return self
+
+    def __enter__(self) -> None:
+        self.profiling_enabled = True
+        self.profile_memory = self.profile_memory
+
+    def __exit__(
+        self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
+    ) -> None:
+        self.profiling_enabled = False
+        self.profile_memory = False
+
+    def increase_nb_elements_loaded(self, nb_elements_loaded: int) -> None:
+        self.nb_elements_loaded += nb_elements_loaded
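The new profile() method uses a small pattern worth calling out: it stashes the option and returns self, so the instance itself is the context manager and __enter__/__exit__ only toggle flags. A stripped-down sketch of that pattern, independent of the Infrahub classes (Profiler is a hypothetical stand-in; typing.Self requires Python 3.11+):

from types import TracebackType
from typing import Optional, Self, Type


class Profiler:
    def __init__(self) -> None:
        self.profiling_enabled = False
        self.profile_memory = False

    def profile(self, profile_memory: bool) -> Self:
        self.profile_memory = profile_memory  # remembered until __exit__ resets it
        return self

    def __enter__(self) -> None:
        self.profiling_enabled = True

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        # Returning None lets any exception propagate after the flags are cleared.
        self.profiling_enabled = False
        self.profile_memory = False


db = Profiler()
with db.profile(profile_memory=True):
    assert db.profiling_enabled  # queries executed here would be measured
assert not db.profiling_enabled

One consequence of this design: the flags live on the shared instance, so nested or concurrent profile() blocks would interfere with each other.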

backend/tests/query_benchmark/conftest.py

Lines changed: 11 additions & 1 deletion

@@ -5,6 +5,7 @@

 from infrahub.core.constants import BranchSupportType
 from infrahub.core.schema import SchemaRoot
+from tests.helpers.query_benchmark.db_query_profiler import GraphProfileGenerator

 RESULTS_FOLDER = Path(__file__).resolve().parent / "query_performance_results"

@@ -54,10 +55,19 @@ async def car_person_schema_root() -> SchemaRoot:
                 ],
                 "relationships": [
                     {"name": "cars", "peer": "TestCar", "cardinality": "many"},
-                    {"name": "animal", "peer": "TestAnimal", "cardinality": "one"},
                 ],
             },
         ],
     }

     return SchemaRoot(**schema)
+
+
+@pytest.fixture(scope="session")
+async def graph_generator() -> GraphProfileGenerator:
+    """
+    Use GraphProfileGenerator as a fixture as it may allow to properly generate graphs from
+    distinct tests, instead of having each test managing its own display.
+    """
+
+    return GraphProfileGenerator()
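Because the fixture is session-scoped, every test receives the same GraphProfileGenerator instance, which is what lets curves from distinct tests land on shared figures. A minimal, self-contained illustration of that semantics (Recorder and the test names are hypothetical):

import pytest


class Recorder:  # stand-in for GraphProfileGenerator
    def __init__(self) -> None:
        self.labels: list[str] = []


@pytest.fixture(scope="session")
def recorder() -> Recorder:
    # Built once per test session, not once per test.
    return Recorder()


def test_first(recorder: Recorder) -> None:
    recorder.labels.append("run-1")


def test_second(recorder: Recorder) -> None:
    # Same instance as in test_first, because the fixture is session-scoped.
    assert recorder.labels == ["run-1"]

Note that the real fixture is an async function, so collecting it as shown in the diff relies on the project's async pytest setup.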

backend/tests/query_benchmark/test_node_unique_attribute_constraint.py

Lines changed: 15 additions & 6 deletions

@@ -18,7 +18,7 @@
     CarGeneratorWithOwnerHavingUniqueCar,
 )
 from tests.helpers.query_benchmark.data_generator import load_data_and_profile
-from tests.helpers.query_benchmark.db_query_profiler import BenchmarkConfig
+from tests.helpers.query_benchmark.db_query_profiler import BenchmarkConfig, GraphProfileGenerator
 from tests.query_benchmark.conftest import RESULTS_FOLDER
 from tests.query_benchmark.utils import start_db_and_create_default_branch

@@ -28,7 +28,12 @@


 async def benchmark_uniqueness_query(
-    query_request, car_person_schema_root, benchmark_config: BenchmarkConfig, test_params_label: str, test_name: str
+    query_request,
+    car_person_schema_root,
+    graph_generator: GraphProfileGenerator,
+    benchmark_config: BenchmarkConfig,
+    test_params_label: str,
+    test_name: str,
 ):
     """
     Profile NodeUniqueAttributeConstraintQuery with a given query_request / configuration, using a Car generator.
@@ -68,6 +73,7 @@ async def init_and_execute():
         nb_elements=nb_cars,
         graphs_output_location=graph_output_location,
         test_label=test_params_label,
+        graph_generator=graph_generator,
     )


@@ -96,14 +102,15 @@
         ),
     ],
 )
-async def test_multiple_constraints(query_request, car_person_schema_root):
+async def test_multiple_constraints(query_request, car_person_schema_root, graph_generator):
     benchmark_config = BenchmarkConfig(neo4j_runtime=Neo4jRuntime.DEFAULT, neo4j_image=NEO4J_ENTERPRISE_IMAGE)
     await benchmark_uniqueness_query(
         query_request=query_request,
         car_person_schema_root=car_person_schema_root,
         benchmark_config=benchmark_config,
         test_params_label=str(query_request),
         test_name=inspect.currentframe().f_code.co_name,
+        graph_generator=graph_generator,
     )


@@ -115,7 +122,7 @@ async def test_multiple_constraints(query_request, car_person_schema_root):
         BenchmarkConfig(neo4j_runtime=Neo4jRuntime.PARALLEL, neo4j_image=NEO4J_ENTERPRISE_IMAGE),
     ],
 )
-async def test_multiple_runtimes(benchmark_config, car_person_schema_root):
+async def test_multiple_runtimes(benchmark_config, car_person_schema_root, graph_generator):
     query_request = NodeUniquenessQueryRequest(
         kind="TestCar",
         unique_attribute_paths={
@@ -133,17 +140,18 @@ async def test_multiple_runtimes(benchmark_config, car_person_schema_root):
         benchmark_config=benchmark_config,
         test_params_label=str(benchmark_config),
         test_name=inspect.currentframe().f_code.co_name,
+        graph_generator=graph_generator,
     )


 @pytest.mark.parametrize(
     "benchmark_config",
     [
         BenchmarkConfig(neo4j_runtime=Neo4jRuntime.PARALLEL, neo4j_image=NEO4J_ENTERPRISE_IMAGE, load_db_indexes=False),
-        BenchmarkConfig(neo4j_runtime=Neo4jRuntime.PARALLEL, neo4j_image=NEO4J_ENTERPRISE_IMAGE, load_db_indexes=True),
+        # BenchmarkConfig(neo4j_runtime=Neo4jRuntime.PARALLEL, neo4j_image=NEO4J_ENTERPRISE_IMAGE, load_db_indexes=True),
     ],
 )
-async def test_indexes(benchmark_config, car_person_schema_root):
+async def test_indexes(benchmark_config, car_person_schema_root, graph_generator):
     query_request = NodeUniquenessQueryRequest(
         kind="TestCar",
         unique_attribute_paths={
@@ -161,4 +169,5 @@ async def test_indexes(benchmark_config, car_person_schema_root):
         benchmark_config=benchmark_config,
         test_params_label=str(benchmark_config),
         test_name=inspect.currentframe().f_code.co_name,
+        graph_generator=graph_generator,
    )
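A side note on these tests: each one passes its own name to the benchmark helper via inspect.currentframe().f_code.co_name, which evaluates to the name of the enclosing function. A tiny standalone check of that expression:

import inspect


def test_example() -> None:
    name = inspect.currentframe().f_code.co_name
    assert name == "test_example"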
