diff --git a/README.md b/README.md index ac26550..085a396 100644 --- a/README.md +++ b/README.md @@ -7,24 +7,31 @@ Initialize and update submodules: git submodule update --init --recursive ``` -Run the following code to setup the virtual environment, add the python files in src to python's -import path, then run the venv +## Download Datasets -``` -python3 -m venv venv +You can download all the datasets we use in the benchmark using the [download\_all.py](/scripts/download_all.py) script we provide. -echo "$(pwd)" > $(find venv/lib -maxdepth 1 -mindepth 1 -type d)/site-packages/project_root.pth +The [download\_all.py](/scripts/download_all.py) script will download all datasets into the correct directories **with** the specified names, concatenate multi-file datasets together into a single file, and generate any modified version of the dataset needed for tools like Presto \+ CLP. -. venv/bin/activate +## Docker Containers -pip3 install -r requirements.txt -``` +Benchmark services run inside Docker containers to provide reproducible, isolated environments for +test service engines with straightforward setup and teardown. -## Download Datasets +While we use existing published images whenever possible, the `log-archival-bench` repository also +builds and maintains its own service-specific images for benchmark testing. -You can download all the datasets we use in the benchmark using the [download\_all.py](/scripts/download_all.py) script we provide. +To build all benchmark service Docker images in parallel: -The [download\_all.py](/scripts/download_all.py) script will download all datasets into the correct directories **with** the specified names, concentrate multi-file datasets together into a single file, and generate any modified version of the dataset needed for tools like Presto \+ CLP. 
+```shell +task docker-images:build +``` + +To build an image for a specific service: + +```shell +uv run src/log_archival_bench/scripts/docker_images/build.py --service-name <service-name> +``` ## Run Everything diff --git a/assets/overhead_test/Dockerfile b/assets/overhead_test/Dockerfile deleted file mode 100644 index 2428ce5..0000000 --- a/assets/overhead_test/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -# This file is used for building the container, ensuring installation of the required tool and -# dependencies - -# If there is any dedicated image available, you should build the benchmarking image on top of that -FROM ghcr.io/y-scope/clp/clp-core-dependencies-x86-ubuntu-jammy:main - -# Install necessary packages -RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y \ - tmux \ - vim - diff --git a/assets/template/Dockerfile b/assets/template/Dockerfile deleted file mode 100644 index 2428ce5..0000000 --- a/assets/template/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -# This file is used for building the container, ensuring installation of the required tool and -# dependencies - -# If there is any dedicated image available, you should build the benchmarking image on top of that -FROM ghcr.io/y-scope/clp/clp-core-dependencies-x86-ubuntu-jammy:main - -# Install necessary packages -RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y \ - tmux \ - vim - diff --git a/pyproject.toml b/pyproject.toml index 3ddc310..70c049d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,8 @@ dev = [ ] [tool.mypy] +explicit_package_bases = true +mypy_path = ["src"] strict = true # Additional output diff --git a/assets/clickhouse/Dockerfile b/src/log_archival_bench/config/docker-images/clickhouse/Dockerfile similarity index 100% rename from assets/clickhouse/Dockerfile rename to src/log_archival_bench/config/docker-images/clickhouse/Dockerfile diff --git a/assets/clickhouse/include/config.xml 
b/src/log_archival_bench/config/docker-images/clickhouse/include/config.xml similarity index 100% rename from assets/clickhouse/include/config.xml rename to src/log_archival_bench/config/docker-images/clickhouse/include/config.xml diff --git a/assets/clickhouse/include/users.xml b/src/log_archival_bench/config/docker-images/clickhouse/include/users.xml similarity index 100% rename from assets/clickhouse/include/users.xml rename to src/log_archival_bench/config/docker-images/clickhouse/include/users.xml diff --git a/assets/clp/Dockerfile b/src/log_archival_bench/config/docker-images/clp/Dockerfile similarity index 100% rename from assets/clp/Dockerfile rename to src/log_archival_bench/config/docker-images/clp/Dockerfile diff --git a/assets/elasticsearch/Dockerfile b/src/log_archival_bench/config/docker-images/elasticsearch/Dockerfile similarity index 100% rename from assets/elasticsearch/Dockerfile rename to src/log_archival_bench/config/docker-images/elasticsearch/Dockerfile diff --git a/assets/presto_clp/Dockerfile b/src/log_archival_bench/config/docker-images/presto_clp/Dockerfile similarity index 100% rename from assets/presto_clp/Dockerfile rename to src/log_archival_bench/config/docker-images/presto_clp/Dockerfile diff --git a/assets/presto_clp/include/etc_coordinator/catalog/clp.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/catalog/clp.properties similarity index 100% rename from assets/presto_clp/include/etc_coordinator/catalog/clp.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/catalog/clp.properties diff --git a/assets/presto_clp/include/etc_coordinator/config.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/config.properties similarity index 100% rename from assets/presto_clp/include/etc_coordinator/config.properties rename to 
src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/config.properties diff --git a/assets/presto_clp/include/etc_coordinator/jvm.config b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/jvm.config similarity index 100% rename from assets/presto_clp/include/etc_coordinator/jvm.config rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/jvm.config diff --git a/assets/presto_clp/include/etc_coordinator/jvm.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/jvm.properties similarity index 100% rename from assets/presto_clp/include/etc_coordinator/jvm.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/jvm.properties diff --git a/assets/presto_clp/include/etc_coordinator/log.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/log.properties similarity index 100% rename from assets/presto_clp/include/etc_coordinator/log.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/log.properties diff --git a/assets/presto_clp/include/etc_coordinator/metadata-filter.json b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/metadata-filter.json similarity index 100% rename from assets/presto_clp/include/etc_coordinator/metadata-filter.json rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/metadata-filter.json diff --git a/assets/presto_clp/include/etc_coordinator/node.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/node.properties similarity index 100% rename from assets/presto_clp/include/etc_coordinator/node.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_coordinator/node.properties diff --git a/assets/presto_clp/include/etc_worker/catalog/clp.properties 
b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/catalog/clp.properties similarity index 100% rename from assets/presto_clp/include/etc_worker/catalog/clp.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/catalog/clp.properties diff --git a/assets/presto_clp/include/etc_worker/config.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/config.properties similarity index 100% rename from assets/presto_clp/include/etc_worker/config.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/config.properties diff --git a/assets/presto_clp/include/etc_worker/node.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/node.properties similarity index 100% rename from assets/presto_clp/include/etc_worker/node.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/node.properties diff --git a/assets/presto_clp/include/etc_worker/velox.properties b/src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/velox.properties similarity index 100% rename from assets/presto_clp/include/etc_worker/velox.properties rename to src/log_archival_bench/config/docker-images/presto_clp/include/etc_worker/velox.properties diff --git a/assets/presto_parquet/Dockerfile b/src/log_archival_bench/config/docker-images/presto_parquet/Dockerfile similarity index 100% rename from assets/presto_parquet/Dockerfile rename to src/log_archival_bench/config/docker-images/presto_parquet/Dockerfile diff --git a/assets/presto_parquet/include/etc_coordinator/catalog/hive.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/catalog/hive.properties similarity index 100% rename from assets/presto_parquet/include/etc_coordinator/catalog/hive.properties rename to 
src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/catalog/hive.properties diff --git a/assets/presto_parquet/include/etc_coordinator/config.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/config.properties similarity index 100% rename from assets/presto_parquet/include/etc_coordinator/config.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/config.properties diff --git a/assets/presto_parquet/include/etc_coordinator/jvm.config b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/jvm.config similarity index 100% rename from assets/presto_parquet/include/etc_coordinator/jvm.config rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/jvm.config diff --git a/assets/presto_parquet/include/etc_coordinator/jvm.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/jvm.properties similarity index 100% rename from assets/presto_parquet/include/etc_coordinator/jvm.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/jvm.properties diff --git a/assets/presto_parquet/include/etc_coordinator/log.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/log.properties similarity index 100% rename from assets/presto_parquet/include/etc_coordinator/log.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/log.properties diff --git a/assets/presto_parquet/include/etc_coordinator/metadata-filter.json b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/metadata-filter.json similarity index 100% rename from assets/presto_parquet/include/etc_coordinator/metadata-filter.json rename to 
src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/metadata-filter.json diff --git a/assets/presto_parquet/include/etc_coordinator/node.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/node.properties similarity index 100% rename from assets/presto_parquet/include/etc_coordinator/node.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_coordinator/node.properties diff --git a/assets/presto_parquet/include/etc_worker/catalog/hive.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/catalog/hive.properties similarity index 100% rename from assets/presto_parquet/include/etc_worker/catalog/hive.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/catalog/hive.properties diff --git a/assets/presto_parquet/include/etc_worker/config.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/config.properties similarity index 100% rename from assets/presto_parquet/include/etc_worker/config.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/config.properties diff --git a/assets/presto_parquet/include/etc_worker/node.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/node.properties similarity index 100% rename from assets/presto_parquet/include/etc_worker/node.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/node.properties diff --git a/assets/presto_parquet/include/etc_worker/velox.properties b/src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/velox.properties similarity index 100% rename from assets/presto_parquet/include/etc_worker/velox.properties rename to src/log_archival_bench/config/docker-images/presto_parquet/include/etc_worker/velox.properties diff --git 
a/assets/sparksql/Dockerfile b/src/log_archival_bench/config/docker-images/sparksql/Dockerfile similarity index 100% rename from assets/sparksql/Dockerfile rename to src/log_archival_bench/config/docker-images/sparksql/Dockerfile diff --git a/assets/zstandard/Dockerfile b/src/log_archival_bench/config/docker-images/zstandard/Dockerfile similarity index 100% rename from assets/zstandard/Dockerfile rename to src/log_archival_bench/config/docker-images/zstandard/Dockerfile diff --git a/src/log_archival_bench/scripts/docker_images/__init__.py b/src/log_archival_bench/scripts/docker_images/__init__.py new file mode 100644 index 0000000..5838a8a --- /dev/null +++ b/src/log_archival_bench/scripts/docker_images/__init__.py @@ -0,0 +1 @@ +"""Scripts related to Docker images and containers.""" diff --git a/src/log_archival_bench/scripts/docker_images/build.py b/src/log_archival_bench/scripts/docker_images/build.py new file mode 100755 index 0000000..d32665c --- /dev/null +++ b/src/log_archival_bench/scripts/docker_images/build.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +"""Builds a Docker image for the specified benchmark service.""" + +import argparse +import subprocess +import sys +from pathlib import Path + +from log_archival_bench.scripts.docker_images.utils import get_image_name, validate_service_name +from log_archival_bench.utils.project_config import CONFIG_DIR, PACKAGE_ROOT + + +def main(argv: list[str]) -> int: + """ + Builds a Docker image for the specified benchmark service. + + :param argv: + :return: 0 on success, non-zero error code on failure. 
+ """ + args_parser = argparse.ArgumentParser() + args_parser.add_argument( + "--service-name", + required=True, + help="The benchmark service that the built Docker image will provide.", + ) + + parsed_args = args_parser.parse_args(argv[1:]) + service_name = parsed_args.service_name + + validate_service_name(service_name) + + docker_file_path = Path(CONFIG_DIR) / "docker-images" / service_name / "Dockerfile" + if not docker_file_path.is_file(): + err_msg = f"Dockerfile for `{service_name}` does not exist in {CONFIG_DIR}/docker-images." + raise RuntimeError(err_msg) + + # fmt: off + build_cmds = [ + "docker", + "build", + "--tag", get_image_name(service_name), + "--file", str(docker_file_path), + str(PACKAGE_ROOT), + ] + # fmt: on + subprocess.run(build_cmds, check=True) + + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/src/log_archival_bench/scripts/docker_images/utils.py b/src/log_archival_bench/scripts/docker_images/utils.py new file mode 100644 index 0000000..c9384a7 --- /dev/null +++ b/src/log_archival_bench/scripts/docker_images/utils.py @@ -0,0 +1,26 @@ +"""Shared helpers for Docker image scripts.""" + +import os + + +def get_image_name(service_name: str) -> str: + """ + :param service_name: + :return: The name assigned to the Docker image that contains the service. + """ + user = os.getenv("USER", "clp-user") + return f"log-archival-bench-{service_name}-ubuntu-jammy:dev-{user}" + + +def validate_service_name(service_name: str) -> None: + """ + :param service_name: The name of the benchmark service. + :raise: ValueError if the service is invalid. + """ + # NOTE: Keep in sync with `G_BENCHMARK_DOCKER_SERVICES` in taskfiles/docker-images/main.yaml + valid_services = ["clickhouse", "clp", "elasticsearch", "sparksql", "zstandard"] + if service_name not in valid_services: + err_msg = ( + f"Invalid service name `{service_name}`. 
Valid services: {', '.join(valid_services)}" + ) + raise ValueError(err_msg) diff --git a/src/log_archival_bench/utils/__init__.py b/src/log_archival_bench/utils/__init__.py new file mode 100644 index 0000000..17e3f67 --- /dev/null +++ b/src/log_archival_bench/utils/__init__.py @@ -0,0 +1 @@ +"""Scripts providing general python utilities for the project.""" diff --git a/src/log_archival_bench/utils/project_config.py b/src/log_archival_bench/utils/project_config.py new file mode 100644 index 0000000..b3b359f --- /dev/null +++ b/src/log_archival_bench/utils/project_config.py @@ -0,0 +1,11 @@ +"""Project configurations.""" + +from pathlib import Path + +import log_archival_bench + +# Constants +PACKAGE_ROOT = Path(log_archival_bench.__file__).parent + +BUILD_DIR = PACKAGE_ROOT / "build" +CONFIG_DIR = PACKAGE_ROOT / "config" diff --git a/taskfile.yaml b/taskfile.yaml index 05d1d44..37dbe37 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -5,9 +5,11 @@ shopt: ["globstar"] includes: lint: "taskfiles/lint/main.yaml" + docker-images: "taskfiles/docker-images/main.yaml" vars: G_OUTPUT_DIR: "{{.ROOT_DIR}}/build" + G_PROJECT_SRC_DIR: "{{.ROOT_DIR}}/src/log_archival_bench" tasks: clean: diff --git a/taskfiles/docker-images/main.yaml b/taskfiles/docker-images/main.yaml new file mode 100644 index 0000000..8d0ee7d --- /dev/null +++ b/taskfiles/docker-images/main.yaml @@ -0,0 +1,41 @@ +version: "3" + +includes: + utils: + internal: true + taskfile: "../../tools/yscope-dev-utils/exports/taskfiles/utils/utils.yaml" + +vars: + G_BENCHMARK_DOCKER_SERVICES: + - "clickhouse" + - "clp" + - "elasticsearch" + # Note: Presto-related service images currently fail to build and are pending fixes. + #- "presto_clp" + #- "presto_parquet" + - "sparksql" + - "zstandard" + G_DOCKER_IMAGE_SCRIPT_DIR: "{{.G_PROJECT_SRC_DIR}}/scripts/docker_images" + +tasks: + build: + # Build Docker images for all containerized benchmark services in parallel. 
+ run: "once" + deps: + - for: + var: "G_BENCHMARK_DOCKER_SERVICES" + task: "build-single-benchmark-service-image" + vars: + SERVICE_NAME: "{{.ITEM}}" + + build-single-benchmark-service-image: + # Builds a Docker image for the specified benchmark service. Runs only once per unique service. + # + # @param {string} SERVICE_NAME The benchmark service that the built Docker image will provide. + internal: true + label: "{{.TASK}}:{{.SERVICE_NAME}}" + requires: + vars: ["SERVICE_NAME"] + run: "when_changed" + cmds: + - "uv run '{{.G_DOCKER_IMAGE_SCRIPT_DIR}}/build.py' --service-name {{.SERVICE_NAME}}"