From d6d1ceb62d116e7449faa2a0f0b6754d41c3d717 Mon Sep 17 00:00:00 2001 From: John Hao Date: Thu, 24 Jul 2025 17:47:14 -0400 Subject: [PATCH 1/2] fix assets renaming --- assets/clickhouse/main.py | 4 ++-- assets/clp/main.py | 4 ++-- assets/presto_clp/main.py | 4 ++-- scripts/benchall.py | 25 +++++++++++++------------ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/assets/clickhouse/main.py b/assets/clickhouse/main.py index 75bce8c..fdf55b6 100755 --- a/assets/clickhouse/main.py +++ b/assets/clickhouse/main.py @@ -13,7 +13,7 @@ """ CLICKHOUSE_COLLECTION_NAME = "clickhouse_bench" -class clickhouse_native_json_bench(Benchmark): +class clickhouse_bench(Benchmark): # add any parameters to the tool here def __init__(self, dataset, manual_column_names=True, keys=[], additional_order_by=[], timestamp_key=False): super().__init__(dataset) @@ -154,7 +154,7 @@ def run_applicable(self, dataset_name): def main(): - bench = clickhouse_native_json_bench(sys.argv[1]) + bench = clickhouse_bench(sys.argv[1]) bench.run_everything() if __name__ == "__main__": diff --git a/assets/clp/main.py b/assets/clp/main.py index 7ad36e6..0e148a0 100755 --- a/assets/clp/main.py +++ b/assets/clp/main.py @@ -5,7 +5,7 @@ CLP_OUT_PATH = f"{WORK_DIR}/archives" CLP_S_BINARY = "/clp/clp-s" -class clp_s_bench(Benchmark): +class clp_bench(Benchmark): def __init__(self, dataset, target_encoded_size=268435456): super().__init__(dataset) @@ -61,7 +61,7 @@ def terminate_procs(self): def main(): - bench = clp_s_bench(sys.argv[1]) + bench = clp_bench(sys.argv[1]) bench.run_everything() if __name__ == "__main__": diff --git a/assets/presto_clp/main.py b/assets/presto_clp/main.py index 487ae6b..4a5255c 100755 --- a/assets/presto_clp/main.py +++ b/assets/presto_clp/main.py @@ -16,7 +16,7 @@ CLP_PRESTO_HOST_STORAGE = os.path.abspath(os.path.expanduser("~/clp-json-x86_64-v0.4.0-dev")) SQL_PASSWORD = "wqEGPyBdx_w" HOST_IP = "127.0.0.1" -class clp_presto_bench(Benchmark): +class presto_clp_bench(Benchmark): # add any parameters to the tool here def __init__(self, dataset, dataset_variation='cleaned_log'): super().__init__(dataset, dataset_variation=dataset_variation) @@ -102,7 +102,7 @@ def terminate(self): def main(): - bench = clp_presto_bench(sys.argv[1]) + bench = presto_clp_bench(sys.argv[1]) bench.run_everything() if __name__ == "__main__": diff --git a/scripts/benchall.py b/scripts/benchall.py index 6854f17..8b46e83 100755 --- a/scripts/benchall.py +++ b/scripts/benchall.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 -from assets.clp_s.main import clp_s_bench -from assets.clickhouse_native_json.main import clickhouse_native_json_bench +from assets.clp.main import clp_bench +from assets.clickhouse.main import clickhouse_bench from assets.sparksql.main import sparksql_bench -from assets.parquet.main import parquet_bench +from assets.presto_parquet.main import parquet_bench from assets.zstandard.main import zstandard_bench from assets.elasticsearch.main import elasticsearch_bench -from assets.clp_presto.main import clp_presto_bench +from assets.presto_clp.main import presto_clp_bench from assets.overhead_test.main import overhead_test_bench from assets.gzip.main import gzip_bench from src.jsonsync import JsonItem @@ -33,14 +33,14 @@ def get_target_from_name(name): benchmarks = [ # benchmark object, arguments - (clp_s_bench, {}), - (clickhouse_native_json_bench, { + (clp_bench, {}), + (clickhouse_bench, { 'manual_column_names': False, 'keys': [], 'additional_order_by': [], 'timestamp_key': True }), - (clp_presto_bench, { + (presto_clp_bench, { 'dataset_variation': "cleaned_log" }), (parquet_bench, {'mode': 'json string'}), @@ -52,7 +52,7 @@ def get_target_from_name(name): (gzip_bench, {}), ] -def run(bencher, kwargs, bench_target, attach=False): +def run(bencher, kwargs, bench_target, attach=False, attach_on_error=False): dataset_name = 'error when finding dataset name' bench = None try: @@ -70,7 +70,7 @@ def run(bencher, kwargs, bench_target, attach=False): with open((current_dir / 'exceptions.log').resolve(), 'a') as file: file.write(f"{statement}\n") print(statement) - if attach: + if attach or attach_on_error: if bench is not None: bench.docker_attach() else: @@ -80,9 +80,10 @@ def run(bencher, kwargs, bench_target, attach=False): for bench_target in bench_target_dirs: dataset_name = os.path.basename(bench_target.resolve()).strip() - #if dataset_name != 'mongod': # only use mongod for now - # continue - run(bencher, kwargs, bench_target) + if dataset_name != 'mongod': # only use mongod for now + continue + #run(bencher, kwargs, bench_target) + run(bencher, kwargs, bench_target, attach_on_error=True) #run(bencher, kwargs, bench_target, attach=True) #run(sparksql_bench, {}, get_target_from_name('mongod')) From 3b4aa8ede708d2e6509e93ff1c4606ae586ab917 Mon Sep 17 00:00:00 2001 From: John Hao Date: Thu, 31 Jul 2025 14:54:02 -0400 Subject: [PATCH 2/2] make changes according to code review --- assets/presto_parquet/main.py | 4 ++-- scripts/benchall.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/assets/presto_parquet/main.py b/assets/presto_parquet/main.py index 671953f..786f545 100755 --- a/assets/presto_parquet/main.py +++ b/assets/presto_parquet/main.py @@ -15,7 +15,7 @@ PARQUET_DATA_PATH = "/home/hive-data" PARQUET_SCHEMA_NAME = "bench_schema" PARQUET_TABLE_NAME = "bench_table" -class parquet_bench(Benchmark): +class presto_parquet_bench(Benchmark): # add any parameters to the tool here def __init__(self, dataset, mode='json string'): super().__init__(dataset) @@ -145,7 +145,7 @@ def terminate(self): time.sleep(10) def main(): - bench = parquet_bench(sys.argv[1]) + bench = presto_parquet_bench(sys.argv[1]) bench.run_everything() if __name__ == "__main__": diff --git a/scripts/benchall.py b/scripts/benchall.py index 8b46e83..53f00b7 100755 --- a/scripts/benchall.py +++ b/scripts/benchall.py @@ -3,7 +3,7 @@ from assets.clp.main import clp_bench from assets.clickhouse.main import clickhouse_bench from assets.sparksql.main import sparksql_bench -from assets.presto_parquet.main import parquet_bench +from assets.presto_parquet.main import presto_parquet_bench from assets.zstandard.main import zstandard_bench from assets.elasticsearch.main import elasticsearch_bench from assets.presto_clp.main import presto_clp_bench @@ -43,8 +43,8 @@ def get_target_from_name(name): (presto_clp_bench, { 'dataset_variation': "cleaned_log" }), - (parquet_bench, {'mode': 'json string'}), - (parquet_bench, {'mode': 'pairwise arrays'}), + (presto_parquet_bench, {'mode': 'json string'}), + (presto_parquet_bench, {'mode': 'pairwise arrays'}), (elasticsearch_bench, {}), (overhead_test_bench, {}), (zstandard_bench, {}), @@ -57,7 +57,6 @@ def run(bencher, kwargs, bench_target, attach=False, attach_on_error=False): bench = None try: dataset_name = os.path.basename(bench_target.resolve()).strip() - # benchmark clp_presto on the cleaned (no spaces) datasets print(f'Benchmarking {bencher.__name__} ({kwargs}) on dataset {dataset_name}') @@ -80,8 +79,8 @@ def run(bencher, kwargs, bench_target, attach=False, attach_on_error=False): for bench_target in bench_target_dirs: dataset_name = os.path.basename(bench_target.resolve()).strip() - if dataset_name != 'mongod': # only use mongod for now - continue + #if dataset_name != 'mongod': # only use mongod for now + # continue #run(bencher, kwargs, bench_target) run(bencher, kwargs, bench_target, attach_on_error=True) #run(bencher, kwargs, bench_target, attach=True)