Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e006938
add visualization dependencies
stephprince Sep 18, 2025
9a9cd78
refactor dataclasses, parquet processing
stephprince Sep 18, 2025
994b1b4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2025
deeb6fb
add figure script - wip
stephprince Sep 24, 2025
f9f0575
update plotting functions
stephprince Sep 25, 2025
c98bcec
update minimum version setting for parquet
stephprince Sep 25, 2025
fec918f
save timestamp info in parquet
stephprince Sep 25, 2025
475e7f4
Merge branch 'main' into add-figure-script
stephprince Sep 25, 2025
eb74de6
remove accidentally tracked figures
stephprince Sep 25, 2025
b9742f0
add slice vs time plots
stephprince Sep 25, 2025
7f50a75
update figure script
stephprince Sep 25, 2025
6b4a4f6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 25, 2025
6d5dc1f
save figures as pdfs with editable text
stephprince Sep 25, 2025
e077e9f
add scatter plots, combine plots, pull out preloaded
stephprince Sep 26, 2025
c272e31
update figure script
stephprince Sep 26, 2025
fac69e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 26, 2025
394b397
update figures from feedback
stephprince Sep 30, 2025
44e74b7
refactor db processing and visualization code
stephprince Sep 30, 2025
1370885
refactor db processing and visualization code
stephprince Sep 30, 2025
b480d2f
update figure generation script to use classes
stephprince Sep 30, 2025
1f54780
add print logging for plotting
stephprince Sep 30, 2025
b2ea98e
ignore pdf figure files
stephprince Sep 30, 2025
60535e7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 30, 2025
b00e126
update cache/no cache order
stephprince Sep 30, 2025
2e3ab14
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 30, 2025
d6f9410
Merge branch 'main' into add-figure-script
CodyCBakerPhD Oct 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,6 @@ fabric.properties

# Spyder
.spyproject/*

# Figures
*.pdf
6 changes: 5 additions & 1 deletion src/nwb_benchmarks/database/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
"""Exposed imports to the `database` submodule."""

from ._models import Environment, Machine, Result, Results
from ._processing import (
from ._parquet import (
concat_dataclasses_to_parquet,
repackage_as_parquet,
)
from ._processing import BenchmarkDatabase
from ._visualization import BenchmarkVisualizer

__all__ = [
"Machine",
"Result",
"Results",
"Environment",
"BenchmarkDatabase",
"BenchmarkVisualizer",
"concat_dataclasses_to_parquet",
"repackage_as_parquet",
]
74 changes: 49 additions & 25 deletions src/nwb_benchmarks/database/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pathlib
import re
import uuid
from datetime import datetime

import packaging.version
import typing_extensions
Expand All @@ -13,7 +14,7 @@
class Result:
uuid: str
version: str
timestamp: str
timestamp: datetime
commit_hash: str
environment_id: str
machine_id: str
Expand All @@ -27,6 +28,49 @@ class Result:
class Results:
results: list[Result]

@staticmethod
def normalize_time_and_network_results(benchmark_results) -> dict:
    """Convert benchmark results to a consistent dict format with list values."""

    def _with_network_metrics(raw: dict) -> dict:
        """Return a copy of *raw* augmented with derived network timing metrics."""
        augmented = raw.copy()

        total_transfer = augmented["total_transfer_time_in_seconds"]
        # Guard against division by zero; NaN marks an undefined ratio.
        augmented["percent_network_time"] = (
            augmented["network_total_time_in_seconds"] / total_transfer if total_transfer != 0 else float("nan")
        )

        packet_count = augmented["total_traffic_in_number_of_web_packets"]
        augmented["mean_time_per_web_packet"] = (
            total_transfer / packet_count if packet_count != 0 else float("nan")
        )

        return augmented

    if isinstance(benchmark_results, dict):
        value_dict = _with_network_metrics(benchmark_results)
    else:
        # Plain (non-dict) results are bare timing values.
        value_dict = {"time": benchmark_results}

    # Ensure every value is a list so downstream consumers see a uniform shape.
    return {key: value if isinstance(value, list) else [float(value)] for key, value in value_dict.items()}

@staticmethod
def parse_parameter_case(s):
    """Parse a parameter-case string into a dictionary.

    ``slice(...)`` calls are not Python literals, so they are wrapped in quotes
    before handing the string to ``ast.literal_eval``.
    """
    quoted = re.sub(r"slice\([^)]+\)", r'"\g<0>"', s)
    parsed = ast.literal_eval(quoted)

    if isinstance(parsed, dict):
        return parsed

    # Older benchmark results stored a bare sequence instead of a dict;
    # treat its first element as the URL.
    return {"https_url": parsed[0].strip("'")}

@classmethod
def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self | None:
with file_path.open(mode="r") as file_stream:
Expand All @@ -43,43 +87,22 @@ def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self
environment_id = data["environment_id"]
machine_id = data["machine_id"]

def normalize_time_and_network_results(benchmark_results) -> dict:
"""Convert benchmark results to a consistent dict format with list values."""
if isinstance(benchmark_results, dict):
value_dict = benchmark_results
else:
value_dict = dict(time=benchmark_results)

# Ensure all values are lists
return {k: v if isinstance(v, list) else [float(v)] for k, v in value_dict.items()}

def parse_parameter_case(s):
# replace any slice(...) with "slice(...)" for safe parsing
modified_s = re.sub(r"slice\([^)]+\)", r'"\g<0>"', s)
output = ast.literal_eval(modified_s)

# if the parsed string is not a dict (older benchmarks results), convert it to one
if not isinstance(output, dict):
output = {"https_url": output[0].strip("'")}

return output

results = [
Result(
uuid=str(uuid.uuid4()), # TODO: add this to each results file so it is persistent
version=database_version,
timestamp=timestamp,
timestamp=datetime.strptime(timestamp, "%Y-%m-%d-%H-%M-%S"),
commit_hash=commit_hash,
environment_id=environment_id,
machine_id=machine_id,
benchmark_name=benchmark_name,
parameter_case=parse_parameter_case(parameter_case),
parameter_case=cls.parse_parameter_case(parameter_case),
value=value,
variable=variable_name,
)
for benchmark_name, parameter_cases in data["results"].items()
for parameter_case, benchmark_results in parameter_cases.items()
for variable_name, values in normalize_time_and_network_results(benchmark_results).items()
for variable_name, values in cls.normalize_time_and_network_results(benchmark_results).items()
for value in values
]

Expand All @@ -94,6 +117,7 @@ def to_dataframe(self) -> "polars.DataFrame":
"commit_hash": [result.commit_hash for result in self.results],
"environment_id": [result.environment_id for result in self.results],
"machine_id": [result.machine_id for result in self.results],
"timestamp": [result.timestamp for result in self.results],
"benchmark_name": [result.benchmark_name for result in self.results],
"parameter_case_name": [result.parameter_case.get("name") for result in self.results],
"parameter_case_https_url": [result.parameter_case.get("https_url") for result in self.results],
Expand Down
95 changes: 95 additions & 0 deletions src/nwb_benchmarks/database/_parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import dataclasses
import pathlib

import packaging
import packaging.version
import polars

from ._models import Environment, Machine, Results


def concat_dataclasses_to_parquet(
    directory: pathlib.Path,
    output_directory: pathlib.Path,
    dataclass_name: str,
    dataclass: type,
    concat_how: str = "diagonal_relaxed",
    minimum_version: str = "1.0.0",
) -> None:
    """Generic function to process any data type (machines, environments, results)

    Args:
        directory (pathlib.Path): Path to the root directory containing data subdirectories.
        output_directory (pathlib.Path): Path to the output directory where the parquet file will be saved.
        dataclass_name (str): Name of the data class, used for input subdirectory and output filenames.
        dataclass (type): The dataclass type to process (Machine, Environment, Results); must provide
            `safe_load_from_json` and its instances must provide `to_dataframe`.
        concat_how (str, optional): How to concatenate dataframes. Defaults to "diagonal_relaxed".
        minimum_version (str, optional): Minimum version of the database to include. Defaults to "1.0.0".
    Returns:
        None. Writes `<dataclass_name>.parquet` to `output_directory`; no file is
        written when there are no loadable records.
    """

    # Parse once, instead of re-parsing the same string for every row in the filter lambda.
    minimum = packaging.version.parse(minimum_version)

    data_frames = []
    data_directory = directory / dataclass_name

    # Sort for a deterministic concatenation order; skip subdirectories and
    # hidden files (e.g. .DS_Store) that are not JSON records.
    for file_path in sorted(data_directory.iterdir()):
        if not file_path.is_file() or file_path.name.startswith("."):
            continue

        obj = dataclass.safe_load_from_json(file_path=file_path)

        if obj is None:
            continue

        data_frame = obj.to_dataframe()

        # filter by minimum version (before concatenation to avoid issues with different results structures)
        # TODO - should environment have a version?
        if "version" in data_frame.columns:
            data_frame = data_frame.filter(
                polars.col("version").map_elements(
                    lambda x: packaging.version.parse(x) >= minimum,
                    return_dtype=polars.Boolean,
                )
            )

        data_frames.append(data_frame)

    if data_frames:
        database = polars.concat(items=data_frames, how=concat_how)
        output_file_path = output_directory / f"{dataclass_name}.parquet"
        database.write_parquet(file=output_file_path)


def repackage_as_parquet(
    directory: pathlib.Path,
    output_directory: pathlib.Path,
    minimum_results_version: str = "1.0.0",
    minimum_machines_version: str = "1.0.0",
) -> None:
    """Repackage JSON results files as parquet databases for easier querying.

    Processes the "machines", "environments", and "results" subdirectories of
    `directory`, writing one parquet file per data type into `output_directory`.
    """
    # (subdirectory name, dataclass, concat strategy, extra keyword arguments)
    specifications = [
        ("machines", Machine, "diagonal_relaxed", {"minimum_version": minimum_machines_version}),
        ("environments", Environment, "diagonal", {}),
        ("results", Results, "diagonal_relaxed", {"minimum_version": minimum_results_version}),
    ]

    for name, data_type, how, extra_kwargs in specifications:
        concat_dataclasses_to_parquet(
            directory=directory,
            output_directory=output_directory,
            dataclass_name=name,
            dataclass=data_type,
            concat_how=how,
            **extra_kwargs,
        )
Loading
Loading