y-scope · anlowee · Jul 28, 2025 · Jul 27, 2025 · Jul 27, 2025 · Jul 27, 2025
@@ -103,7 +103,8 @@ tasks:
           components/package-template/src/etc \
           docs \
           taskfile.yaml \
-          taskfiles
+          taskfiles \
+          tools/deployment
 
   check-cpp-format:
     sources: &cpp_source_files
@@ -772,6 +773,7 @@ tasks:
           - "components/clp-py-utils/clp_py_utils"
           - "components/core/tools/scripts/utils"
           - "components/job-orchestration/job_orchestration"
+          - "tools/deployment"
           - "tools/scripts"
           - "docs/conf"
         cmd: |-

diff --git a/tools/deployment/presto-clp/README.md b/tools/deployment/presto-clp/README.md
@@ -0,0 +1,73 @@
+# Set up a local Docker stack for Presto + CLP
+
+## Install docker
+
+Follow the guide here: [docker]
+
+# Launch clp-package
+
+1. Download the clp-json package from the official release page: [clp-json-v0.4.0]. A sample dataset for demo testing is available here: [postgresql dataset].
+
+2. Untar the clp-package and the postgresql dataset.
+
+3. Launch:
+
+```bash
+# You probably want to run a python 3.9 or newer virtual environment
+sbin/start-clp.sh
+```
+
+4. Compress:
+
+```bash
+# You can also use your own dataset
+sbin/compress.sh --timestamp-key 'timestamp' /path/to/postgresql.log
+```
+
+5. Use the following command to update `.env`:
+
+```bash
+scripts/set-up-config.sh /path/to/clp-json-package
+```
+
+# Create Docker Cluster
+
+Create a local docker stack:
+
+```bash
+docker compose up
+```
+
+To create a Docker stack with more than one worker (e.g., 3 workers):
+
+```bash
+docker compose up --scale presto-worker=3
+```
+
+# Use the CLI
+
+Once all containers are running (check with `docker ps`):
+
+```bash
+# On your host
+docker exec -it compose-presto-coordinator-1 sh
+
+# In presto-coordinator container
+/opt/presto-cli --catalog clp --schema default --server localhost:8080
+```
+
+Example query:
+```sql
+SELECT * FROM default LIMIT 1;
+```
+
+# Delete the Docker cluster
+
+```bash
+docker compose down
+```
+
+
+
+[clp-json-v0.4.0]: https://github.com/y-scope/clp/releases/tag/v0.4.0
+[docker]: https://docs.docker.com/engine/install
+[postgresql dataset]: https://zenodo.org/records/10516402
@@ -0,0 +1,5 @@
+PRESTO_COORDINATOR_HTTPPORT="8080"
+PRESTO_COORDINATOR_SERVICENAME="presto-coordinator"
+
+# node.properties
+PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT="production"
@@ -0,0 +1,14 @@
+# clp.properties
+PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE="mysql"
+PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER="mysql"
+
+# config.properties
+PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY="1GB"
+PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE="1GB"
+
+# jvm.config
+PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE="4G"
+PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE="32M"
+
+# log.properties
+PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL="DEBUG"
@@ -0,0 +1,9 @@
+connector.name=clp
+clp.metadata-provider-type=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE}
+clp.metadata-db-url=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL}
+clp.metadata-db-name=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME}
+clp.metadata-db-user=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER}
+clp.metadata-db-password=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD}
+clp.metadata-table-prefix=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX}
+clp.split-provider-type=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER}
+clp.metadata-filter-config=/opt/presto-server/etc/metadata-filter.json
@@ -0,0 +1,13 @@
+coordinator=true
+node-scheduler.include-coordinator=false
+http-server.http.port=${PRESTO_COORDINATOR_HTTPPORT}
+query.max-memory=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY}
+query.max-memory-per-node=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE}
+discovery-server.enabled=true
+discovery.uri=http://${PRESTO_COORDINATOR_SERVICENAME}:${PRESTO_COORDINATOR_HTTPPORT}
+optimizer.optimize-hash-generation=false
+regex-library=RE2J
+use-alternative-function-signatures=true
+inline-sql-functions=false
+nested-data-serialization-enabled=false
+native-execution-enabled=true
@@ -0,0 +1,9 @@
+-server
+-Xmx${PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE}
+-XX:+UseG1GC
+-XX:G1HeapRegionSize=${PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE}
+-XX:+UseGCOverheadLimit
+-XX:+ExplicitGCInvokesConcurrent
+-XX:+HeapDumpOnOutOfMemoryError
+-XX:+ExitOnOutOfMemoryError
+-Djdk.attach.allowAttachSelf=true
@@ -0,0 +1 @@
+com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL}
-com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL}
+com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL:-INFO}
-com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL}
+com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL:-INFO}
@@ -0,0 +1,2 @@
+{
+}
@@ -0,0 +1,2 @@
+node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT}
+node.id=${PRESTO_COORDINATOR_SERVICENAME}
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+# Exit on error
+set -e
+
+PRESTO_CONFIG_DIR="/opt/presto-server/etc"
+
+# Substitute environment variables in config template
+find /configs -type f | while read -r f; do
+  ( echo "cat <<EOF"; cat $f; echo "EOF" ) | sh > "${PRESTO_CONFIG_DIR}/$(basename "$f")"
+done
-# Substitute environemnt variables in config template
-find /configs -type f | while read -r f; do
-  ( echo "cat <<EOF"; cat $f; echo "EOF" ) | sh > "${PRESTO_CONFIG_DIR}/$(basename "$f")"
-done
+# Substitute environment variables in config template
+find /configs -type f | while read -r f; do
+  envsubst < "$f" > "${PRESTO_CONFIG_DIR}/$(basename "$f")"
+done
-# Substitute environemnt variables in config template
-find /configs -type f | while read -r f; do
-  ( echo "cat <<EOF"; cat $f; echo "EOF" ) | sh > "${PRESTO_CONFIG_DIR}/$(basename "$f")"
-done
+# Substitute environment variables in config template
+find /configs -type f | while read -r f; do
+  envsubst < "$f" > "${PRESTO_CONFIG_DIR}/$(basename "$f")"
+done
+
+# Set up the config directory hierarchy. Create the catalog directory first so that the `mv`
+# below moves the file into it instead of renaming the file to "catalog" when it is missing.
+mkdir -p "${PRESTO_CONFIG_DIR}/catalog"
+rm -f "${PRESTO_CONFIG_DIR}"/catalog/*
+
+# Move the generated CLP catalog config into the catalog directory
+mv "${PRESTO_CONFIG_DIR}/clp.properties" "${PRESTO_CONFIG_DIR}/catalog/"
@@ -0,0 +1,42 @@
+version: "3.9"
+
+services:
+  presto-coordinator:
+    image: "ghcr.io/y-scope/presto/coordinator:dev"
+    entrypoint: ["/bin/bash", "-c", "/scripts/generate-configs.sh && /opt/entrypoint.sh"]
+    env_file:
+      - ".env"
+      - "coordinator-common.env"
+      - "coordinator.env"
+    volumes:
+      - "./coordinator/scripts:/scripts:ro"
+      - "coordinator-config:/opt/presto-server/etc"
+      - "./coordinator/config-template:/configs:ro"
+    networks:
+      - "presto"
+
+  presto-worker:
+    image: "ghcr.io/y-scope/presto/prestissimo-worker:dev"
+    depends_on:
+      presto-coordinator:
+        condition: "service_started"
+    entrypoint: ["/bin/bash", "-c", "/scripts/generate-configs.sh && /opt/entrypoint.sh"]
+    env_file:
+      - ".env"
+      - "coordinator-common.env"
+      - "worker.env"
+    volumes:
+      - "./worker/scripts:/scripts:ro"
+      - "worker-config:/opt/presto-server/etc"
+      - "./worker/config-template:/configs:ro"
+      - "${CLP_PACKAGE_ARCHIVES}:${CLP_PACKAGE_ARCHIVES}"
+    networks:
+      - "presto"
+
+volumes:
+  coordinator-config:
+  worker-config:
+
+networks:
+  presto:
+    driver: "bridge"
@@ -0,0 +1 @@
+/.venv/
@@ -0,0 +1,120 @@
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+# Set up console logging
+logging_console_handler = logging.StreamHandler()
+logging_formatter = logging.Formatter(
+    "%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s", datefmt="%Y-%m-%dT%H:%M:%S"
+)
+logging_console_handler.setFormatter(logging_formatter)
+
+# Set up root logger
+root_logger = logging.getLogger()
+root_logger.setLevel(logging.INFO)
+root_logger.addHandler(logging_console_handler)
+
+# Create logger
+logger = logging.getLogger(__name__)
-# Set up console logging
-logging_console_handler = logging.StreamHandler()
-logging_formatter = logging.Formatter(
-    "%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s", datefmt="%Y-%m-%dT%H:%M:%S"
-)
-logging_console_handler.setFormatter(logging_formatter)
-
-# Set up root logger
-root_logger = logging.getLogger()
-root_logger.setLevel(logging.INFO)
-root_logger.addHandler(logging_console_handler)
-
-# Create logger
-logger = logging.getLogger(__name__)
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s",
+    datefmt="%Y-%m-%dT%H:%M:%S"
+)
+logger = logging.getLogger(__name__)
-# Set up console logging
-logging_console_handler = logging.StreamHandler()
-logging_formatter = logging.Formatter(
-    "%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s", datefmt="%Y-%m-%dT%H:%M:%S"
-)
-logging_console_handler.setFormatter(logging_formatter)
-
-# Set up root logger
-root_logger = logging.getLogger()
-root_logger.setLevel(logging.INFO)
-root_logger.addHandler(logging_console_handler)
-
-# Create logger
-logger = logging.getLogger(__name__)
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s",
+    datefmt="%Y-%m-%dT%H:%M:%S"
+)
+logger = logging.getLogger(__name__)
+
+
+def main(argv=None) -> int:
+    if argv is None:
+        argv = sys.argv
+
+    args_parser = argparse.ArgumentParser(
+        description="Generates an environment variables file for any user-configured properties."
+    )
+    args_parser.add_argument(
+        "--clp-package-dir", help="CLP package directory.", required=True, type=Path
+    )
+    args_parser.add_argument(
+        "--output-file", help="Path for the environment variables file.", required=True, type=Path
+    )
+
+    parsed_args = args_parser.parse_args(argv[1:])
+    clp_package_dir: Path = parsed_args.clp_package_dir.resolve()
+    output_file: Path = parsed_args.output_file
+
+    clp_config_file_path = clp_package_dir / "etc" / "clp-config.yml"
+    with open(clp_config_file_path, "r") as clp_config_file:
+        clp_config = yaml.safe_load(clp_config_file)
+
+    database_host = _get_config_value(clp_config, "database.host", "localhost")
+    database_port = _get_config_value(clp_config, "database.port", 3306)
+    database_name = _get_config_value(clp_config, "database.name", "clp-db")
+
+    clp_archive_output_storage_type = _get_config_value(
+        clp_config, "archive_output.storage.type", "fs"
+    )
+    if "fs" != clp_archive_output_storage_type:
+        logger.error(
+            "Expected CLP's archive_output.storage.type to be fs but found '%s'. Presto currently only supports"
+            " reading archives from the fs storage type.",
+            clp_archive_output_storage_type,
+        )
+
+    clp_archives_dir = _get_config_value(
+        clp_config,
+        "archive_output.storage.directory",
+        str(clp_package_dir / "var" / "data" / "archives"),
+    )
+
+    credentials_file_path = clp_package_dir / "etc" / "credentials.yml"
+    with open(credentials_file_path, "r") as credentials_file:
+        credentials = yaml.safe_load(credentials_file)
+
+    database_user = _get_config_value(credentials, "database.user")
+    database_password = _get_config_value(credentials, "database.password")
+    if not database_user or not database_password:
+        logger.error(
+            "database.user and database.password must be specified in '%s'.", credentials_file_path
+        )
+        return 1
+
+    with open(output_file, "w") as env_file:
+        env_file.write(
+            "PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL"
+            f"=jdbc:mysql://{database_host}:{database_port}\n"
+        )
+        env_file.write(
+            f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME={database_name}\n"
+        )
+        env_file.write(
+            f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER={database_user}\n"
+        )
+        env_file.write(
+            f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD={database_password}\n"
+        )
+        env_file.write(f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX=clp_\n")
+        env_file.write(f"CLP_PACKAGE_ARCHIVES={clp_archives_dir}\n")
+
+    return 0
+
+
+def _get_config_value(config: dict, key: str, default_value: Optional[str] = None) -> str:
+    """
+    Gets the value corresponding to `key` from `config` if it exists.
+
+    :param config: The config.
+    :param key: The key to look for in the config, in dot notation (e.g., "database.host").
+    :param default_value: The value to return if `key` doesn't exist in `config`.
+    :return: The value corresponding to `key` if it exists, otherwise `default_value`.
+    """
+
+    keys = key.split(".")
+    value = config
+    for k in keys:
+        if isinstance(value, dict) and k in value:
+            value = value[k]
+        else:
+            return default_value
+    return value
+
+
+if "__main__" == __name__:
+    sys.exit(main(sys.argv))
@@ -0,0 +1 @@
+PyYAML
-PyYAML
+# tools/deployment/presto-clp/scripts/requirements.txt
+-PyYAML
+PyYAML>=6.0,<7.0
-PyYAML
+# tools/deployment/presto-clp/scripts/requirements.txt
+-PyYAML
+PyYAML>=6.0,<7.0
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+set -eu
+set -o pipefail
+
+script_dir=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cUsage="Usage: ${BASH_SOURCE[0]} <clp-package-dir>"
+if [ "$#" -lt 1 ] ; then
+    echo "$cUsage"
+    exit
+fi
-if [ "$#" -lt 1 ] ; then
-    echo "$cUsage"
-    exit
-fi
+if [ "$#" -lt 1 ] ; then
+    echo "$cUsage" >&2
+    exit 1
+fi
-if [ "$#" -lt 1 ] ; then
-    echo "$cUsage"
-    exit
-fi
+if [ "$#" -lt 1 ] ; then
+    echo "$cUsage" >&2
+    exit 1
+fi
+clp_package_dir=$1
+
-clp_package_dir=$1
+clp_package_dir="$1"
-clp_package_dir=$1
+clp_package_dir="$1"
+# Create the Python venv next to this script on first use, and reuse it afterwards
+venv_dir="${script_dir}/.venv"
+if [ ! -d "${venv_dir}" ]; then
+    echo "Setting up Python venv in '${venv_dir}'..."
+    python3 -m venv "${venv_dir}"
+fi
+source "${venv_dir}/bin/activate"
+
+echo "Installing required Python packages..."
+pip3 install -r "${script_dir}/requirements.txt"
+
+# Write the generated variables to the `.env` file in the parent directory of this script
+echo "Generating environment variables file for user-configured properties..."
+python3 "${script_dir}/generate-user-env-vars-file.py" \
+    --clp-package-dir "${clp_package_dir}" \
+    --output-file "${script_dir}/../.env"
@@ -0,0 +1,4 @@
+PRESTO_WORKER_HTTPPORT="8080"
+
+# node.properties
+PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location"
-PRESTO_WORKER_HTTPPORT="8080"
-
-# node.properties
-PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location"
+PRESTO_WORKER_HTTP_PORT="8080"
+
+# node.properties
+PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location"
-PRESTO_WORKER_HTTPPORT="8080"
-
-# node.properties
-PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location"
+PRESTO_WORKER_HTTP_PORT="8080"
+
+# node.properties
+PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location"
@@ -0,0 +1 @@
+connector.name=clp
@@ -0,0 +1,6 @@
+discovery.uri=http://${PRESTO_COORDINATOR_SERVICENAME}:${PRESTO_COORDINATOR_HTTPPORT}
+presto.version=REPLACE_ME
-presto.version=REPLACE_ME
+presto.version=${PRESTO_VERSION}
-presto.version=REPLACE_ME
+presto.version=${PRESTO_VERSION}
+http-server.http.port=${PRESTO_WORKER_HTTPPORT}
+shutdown-onset-sec=1
+register-test-functions=false
+runtime-metrics-collection-enabled=false
@@ -0,0 +1,4 @@
+node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT}
+node.internal-address=REPLACE_ME
+node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION}
+node.id=REPLACE_ME
-node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT}
-node.internal-address=REPLACE_ME
-node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION}
-node.id=REPLACE_ME
+node.environment=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_ENVIRONMENT}
+node.internal-address=REPLACE_ME
+node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION}
+node.id=REPLACE_ME
-node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT}
-node.internal-address=REPLACE_ME
-node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION}
-node.id=REPLACE_ME
+node.environment=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_ENVIRONMENT}
+node.internal-address=REPLACE_ME
+node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION}
+node.id=REPLACE_ME
@@ -0,0 +1 @@
+mutable-config=true
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{
		}
Comment on lines +1 to +2 Copy link Contributor coderabbitai bot Jul 27, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick (assertive) Provide a minimal example or document expected schema An empty JSON object is syntactically valid, but future maintainers may be unsure what keys are supported. A commented exemplar or pointer to docs beside this file would improve clarity without affecting runtime. 🤖 Prompt for AI Agents `In tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json at lines 1 to 2, the JSON file is currently empty, which may confuse future maintainers about the expected keys. Add a minimal example JSON object with typical keys and values or include comments or a reference to documentation explaining the expected schema to improve clarity without impacting runtime behavior.` Comment on lines +1 to +2 Copy link Member kirkrodrigues Jul 28, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. @anlowee I think we should explain to the user how to configure this for the timestamp field in their logs, right? And also that it may need to be different for each dataset they compress. Copy link Contributor Author anlowee Jul 28, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Can we direct them to the related presto-doc section? Copy link Member kirkrodrigues Jul 28, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. That's not published and is more general than they need, right? We should write a simplified section for them here.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT}
		node.id=${PRESTO_COORDINATOR_SERVICENAME}