Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions assets/clickhouse/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"""

CLICKHOUSE_COLLECTION_NAME = "clickhouse_bench"
class clickhouse_native_json_bench(Benchmark):
class clickhouse_bench(Benchmark):
# add any parameters to the tool here
def __init__(self, dataset, manual_column_names=True, keys=[], additional_order_by=[], timestamp_key=False):
super().__init__(dataset)
Expand Down Expand Up @@ -154,7 +154,7 @@ def run_applicable(self, dataset_name):


def main():
bench = clickhouse_native_json_bench(sys.argv[1])
bench = clickhouse_bench(sys.argv[1])
bench.run_everything()

if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions assets/clp/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

CLP_OUT_PATH = f"{WORK_DIR}/archives"
CLP_S_BINARY = "/clp/clp-s"
class clp_s_bench(Benchmark):
class clp_bench(Benchmark):
def __init__(self, dataset, target_encoded_size=268435456):
super().__init__(dataset)

Expand Down Expand Up @@ -61,7 +61,7 @@ def terminate_procs(self):


def main():
bench = clp_s_bench(sys.argv[1])
bench = clp_bench(sys.argv[1])
bench.run_everything()

if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions assets/presto_clp/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
CLP_PRESTO_HOST_STORAGE = os.path.abspath(os.path.expanduser("~/clp-json-x86_64-v0.4.0-dev"))
SQL_PASSWORD = "wqEGPyBdx_w"
HOST_IP = "127.0.0.1"
class clp_presto_bench(Benchmark):
class presto_clp_bench(Benchmark):
# add any parameters to the tool here
def __init__(self, dataset, dataset_variation='cleaned_log'):
super().__init__(dataset, dataset_variation=dataset_variation)
Expand Down Expand Up @@ -102,7 +102,7 @@ def terminate(self):


def main():
bench = clp_presto_bench(sys.argv[1])
bench = presto_clp_bench(sys.argv[1])
bench.run_everything()

if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions assets/presto_parquet/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
PARQUET_DATA_PATH = "/home/hive-data"
PARQUET_SCHEMA_NAME = "bench_schema"
PARQUET_TABLE_NAME = "bench_table"
class parquet_bench(Benchmark):
class presto_parquet_bench(Benchmark):
# add any parameters to the tool here
def __init__(self, dataset, mode='json string'):
super().__init__(dataset)
Expand Down Expand Up @@ -145,7 +145,7 @@ def terminate(self):
time.sleep(10)

def main():
bench = parquet_bench(sys.argv[1])
bench = presto_parquet_bench(sys.argv[1])
bench.run_everything()

if __name__ == "__main__":
Expand Down
26 changes: 13 additions & 13 deletions scripts/benchall.py
Copy link
Member

@kirkrodrigues kirkrodrigues Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a reference to clp_presto in a comment on line 60.

Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/usr/bin/env python3

from assets.clp_s.main import clp_s_bench
from assets.clickhouse_native_json.main import clickhouse_native_json_bench
from assets.clp.main import clp_bench
from assets.clickhouse.main import clickhouse_bench
from assets.sparksql.main import sparksql_bench
from assets.parquet.main import parquet_bench
from assets.presto_parquet.main import presto_parquet_bench
from assets.zstandard.main import zstandard_bench
from assets.elasticsearch.main import elasticsearch_bench
from assets.clp_presto.main import clp_presto_bench
from assets.presto_clp.main import presto_clp_bench
from assets.overhead_test.main import overhead_test_bench
from assets.gzip.main import gzip_bench
from src.jsonsync import JsonItem
Expand All @@ -33,31 +33,30 @@ def get_target_from_name(name):


benchmarks = [ # benchmark object, arguments
(clp_s_bench, {}),
(clickhouse_native_json_bench, {
(clp_bench, {}),
(clickhouse_bench, {
'manual_column_names': False,
'keys': [],
'additional_order_by': [],
'timestamp_key': True
}),
(clp_presto_bench, {
(presto_clp_bench, {
'dataset_variation': "cleaned_log"
}),
Comment on lines +36 to 45
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick (assertive)

LGTM! Benchmark list updated correctly.

The benchmarks list has been properly updated to use the new class names, maintaining the same configuration parameters.

Consider adding a trailing comma after the last dictionary entry on line 44 for consistency:

        (presto_clp_bench, {
-            'dataset_variation': "cleaned_log"
+            'dataset_variation': "cleaned_log",
             }),

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.12.2)

41-41: Trailing comma missing

Add trailing comma

(COM812)


44-44: Trailing comma missing

Add trailing comma

(COM812)

🤖 Prompt for AI Agents
In scripts/benchall.py around lines 36 to 45, add a trailing comma after the last
entry of each multi-line dictionary (lines 41 and 44) to satisfy Ruff COM812. This
means placing a comma after 'timestamp_key': True in the clickhouse_bench tuple and
after 'dataset_variation': "cleaned_log" in the presto_clp_bench tuple.

(parquet_bench, {'mode': 'json string'}),
(parquet_bench, {'mode': 'pairwise arrays'}),
(presto_parquet_bench, {'mode': 'json string'}),
(presto_parquet_bench, {'mode': 'pairwise arrays'}),
(elasticsearch_bench, {}),
(overhead_test_bench, {}),
(zstandard_bench, {}),
(sparksql_bench, {}),
(gzip_bench, {}),
]

def run(bencher, kwargs, bench_target, attach=False):
def run(bencher, kwargs, bench_target, attach=False, attach_on_error=False):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Add type annotations and consider parameter naming.

The function signature has been extended appropriately with the attach_on_error parameter. However, several improvements could be made for code quality.

Apply this diff to add type annotations and improve the function signature:

-def run(bencher, kwargs, bench_target, attach=False, attach_on_error=False):
+def run(bencher: type, kwargs: dict, bench_target: Path, attach: bool = False, attach_on_error: bool = False) -> None:

You'll need to make sure Path is imported at the top of the file (the new signature uses Path, not Optional):

+from pathlib import Path

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.12.2)

55-55: Missing return type annotation for public function run

Add return type annotation: None

(ANN201)


55-55: Missing type annotation for function argument bencher

(ANN001)


55-55: Missing type annotation for function argument kwargs

(ANN001)


55-55: Missing type annotation for function argument bench_target

(ANN001)


55-55: Boolean default positional argument in function definition

(FBT002)


55-55: Missing type annotation for function argument attach

(ANN001)


55-55: Boolean default positional argument in function definition

(FBT002)


55-55: Missing type annotation for function argument attach_on_error

(ANN001)

🤖 Prompt for AI Agents
In scripts/benchall.py at line 55, the function run lacks type annotations and
could benefit from clearer parameter naming. Add appropriate type annotations to
the function signature for all parameters, including bencher, kwargs,
bench_target, attach, and attach_on_error. Also, rename parameters if needed to
improve clarity and maintain consistency. Import any necessary typing modules at
the top of the file to support these annotations.

dataset_name = 'error when finding dataset name'
bench = None
try:
dataset_name = os.path.basename(bench_target.resolve()).strip()
# benchmark presto_clp on the cleaned (no spaces) datasets

print(f'Benchmarking {bencher.__name__} ({kwargs}) on dataset {dataset_name}')

Expand All @@ -70,7 +69,7 @@ def run(bencher, kwargs, bench_target, attach=False):
with open((current_dir / 'exceptions.log').resolve(), 'a') as file:
file.write(f"{statement}\n")
print(statement)
if attach:
if attach or attach_on_error:
if bench is not None:
bench.docker_attach()
else:
Expand All @@ -82,7 +81,8 @@ def run(bencher, kwargs, bench_target, attach=False):

#if dataset_name != 'mongod': # only use mongod for now
# continue
run(bencher, kwargs, bench_target)
#run(bencher, kwargs, bench_target)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we delete this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a change for it, but I can't cherry-pick it from the initial commit, and it doesn't fit in the scope of this PR. I'll open a separate PR for it once these are merged (or I can put it here).

run(bencher, kwargs, bench_target, attach_on_error=True)
#run(bencher, kwargs, bench_target, attach=True)

#run(sparksql_bench, {}, get_target_from_name('mongod'))