add Alibaba Cloud Hologres #87
@@ -0,0 +1,23 @@
Hologres is an all-in-one real-time data warehouse engine that is compatible with PostgreSQL. It supports online analytical processing (OLAP) and ad hoc analysis of PB-scale data, as well as online data serving at high concurrency and low latency.

To evaluate the performance of Hologres, follow these guidelines to set up and execute the benchmark.
1. **Instance Purchase**:
Refer to the [Alibaba Cloud Hologres TPC-H testing documentation](https://www.alibabacloud.com/help/en/hologres/user-guide/test-plan?spm=a2c63.p38356.help-menu-113622.d_2_14_0_0.54e14f70oTAEXO) for details on purchasing Hologres and ECS instances. Both instances must be purchased in the same region and zone.
2. **Benchmark Execution**:
Once the instances are set up, prepare the following parameters:
- `user`: user name for Hologres; users can be created in the Hologres web console
- `password`: password for Hologres, set when the user is created
- `host_name`: hostname of the Hologres instance, shown in the Alibaba Cloud console; select the VPC endpoint for the best performance
- `port`: port of the Hologres instance (usually `80`)

Then set up the environment variables:
```
export PG_USER={user}
export PG_PASSWORD={password}
export PG_HOSTNAME={host_name}
export PG_PORT={port}
```
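
The scripts below reach the instance through a `$HOLOGRES_PSQL` variable that the exports above do not define; a minimal sketch of how it could be set (this wrapper is an assumption, not part of the diff):

```bash
# Assumed helper: wrap psql with the connection parameters exported above;
# psql picks up the password from PGPASSWORD
export PGPASSWORD="$PG_PASSWORD"
export HOLOGRES_PSQL="psql -h $PG_HOSTNAME -p $PG_PORT -U $PG_USER"
```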

3. **Sample Execution**:
```bash
./main.sh 5 /root/bluesky
```
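
Here `5` is presumably the number of `.json.gz` data files to load and `/root/bluesky` the directory containing them, matching the `<max_files>` and `<directory>` arguments that are passed down to `load_data.sh`.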
@@ -0,0 +1,37 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <DB_NAME> [RESULT_FILE]"
    exit 1
fi

# Arguments
DB_NAME="$1"
RESULT_FILE="${2:-}"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# Construct the query log file name using $DB_NAME
QUERY_LOG_FILE="${OUTPUT_PREFIX}_${DB_NAME}.query_log"

# Print the database name
echo "Running queries on database: $DB_NAME"

# Run queries and log the output
./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE"

# Extract the psql timings from the query log, convert ms to seconds,
# and emit one "[run1,run2,run3]," row per query
RESULT=$(grep -oP 'Time: \d+\.\d+ ms' "$QUERY_LOG_FILE" | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | \
    awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }')
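# For illustration (not part of the original script): given three timing lines
#     Time: 123.456 ms
#     Time: 78.9 ms
#     Time: 45.6 ms
# the pipeline above prints "[0.123456,0.0789,0.0456]," -- one bracketed triple
# of seconds per query, assuming run_queries.sh executes each query three times.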

# Output the result
if [[ -n "$RESULT_FILE" ]]; then
    echo "$RESULT" > "$RESULT_FILE"
    echo "Result written to $RESULT_FILE"
else
    echo "$RESULT"
fi

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
@@ -0,0 +1,18 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# Count the rows in the target table
$HOLOGRES_PSQL -d "$DB_NAME" -t -c "SELECT count(*) FROM $TABLE_NAME"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
@@ -0,0 +1,42 @@
#!/bin/bash

# set -e

# Check if the required arguments are provided
if [[ $# -lt 7 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DDL_FILE> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"
DDL_FILE="$3"
DATA_DIRECTORY="$4"
NUM_FILES="$5"
SUCCESS_LOG="$6"
ERROR_LOG="$7"

# Validate arguments
[[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; }
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

echo "Drop and create database"
$HOLOGRES_PSQL -c "DROP DATABASE IF EXISTS $DB_NAME" -c "CREATE DATABASE $DB_NAME"

# Disable the server-side result cache so that benchmark queries are actually executed
echo "Disable result cache."
$HOLOGRES_PSQL -c "ALTER DATABASE $DB_NAME SET hg_experimental_enable_result_cache TO off;"

echo "Execute DDL"
$HOLOGRES_PSQL -d "$DB_NAME" -t < "$DDL_FILE"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"

# Reclaim space, refresh planner statistics, and fully compact/compress the table
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Vacuum analyze the table"
$HOLOGRES_PSQL -d "$DB_NAME" -c '\timing' -c "VACUUM $TABLE_NAME"
$HOLOGRES_PSQL -d "$DB_NAME" -c '\timing' -c "ANALYZE $TABLE_NAME"
$HOLOGRES_PSQL -d "$DB_NAME" -c '\timing' -c "SELECT hologres.hg_full_compact_table('$TABLE_NAME')"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
@@ -0,0 +1,7 @@
CREATE TABLE bluesky (
    data JSONB NOT NULL
);

ALTER TABLE bluesky ALTER COLUMN data SET (enable_columnar_type = ON);
CALL set_table_property('bluesky', 'dictionary_encoding_columns', 'data:auto');
CALL set_table_property('bluesky', 'bitmap_columns', 'data:auto');
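
For orientation, a minimal sketch of how queries can address the single JSONB column (the `kind` field and the database name are assumptions based on the Bluesky dataset, not taken from this diff):

```bash
# Hypothetical smoke test: aggregate on a field extracted from the JSONB column
$HOLOGRES_PSQL -d bluesky -c \
    "SELECT data->>'kind' AS kind, count(*) FROM bluesky GROUP BY kind ORDER BY count(*) DESC;"
```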
**Review comment:** An extra index, such as

**Reply:** The rule is very clear: "It is allowed to apply various indexing methods whenever appropriate." Bitmap is a very common indexing method.

**Reply:** Clarified here: #95
@@ -0,0 +1,17 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <DB_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# Drop the benchmark database
$HOLOGRES_PSQL -c "DROP DATABASE $DB_NAME"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
@@ -0,0 +1,31 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <DB_NAME> [EXPLAIN_CMD]"
    exit 1
fi

# Arguments
DB_NAME="$1"
EXPLAIN_CMD="$2"

QUERY_NUM=1

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

while read -r query; do

    # Print the query number
    echo "------------------------------------------------------------------------------------------------------------------------"
    echo "Index usage for query Q$QUERY_NUM:"
    echo

    # Prefix the query with the EXPLAIN command (e.g. "EXPLAIN") and run it
    $HOLOGRES_PSQL -d "$DB_NAME" -t -c "$EXPLAIN_CMD $query"

    # Increment the query number
    QUERY_NUM=$((QUERY_NUM + 1))

done < queries.sql

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
@@ -0,0 +1,6 @@
#!/bin/bash

# https://www.postgresql.org/download/linux/ubuntu/

sudo apt-get update
sudo apt-get install -y postgresql-common postgresql-16
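
After installation, a quick sanity check (a sketch; it assumes the `HOLOGRES_PSQL` wrapper from the README section, and relies on Hologres speaking the standard PostgreSQL wire protocol):

```bash
# Confirm the client is installed and can reach the instance
psql --version
$HOLOGRES_PSQL -c "SELECT version();"
```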
**Review comment:** I would suggest printing all settings of the DB after installation, so that everyone can reproduce the test results of this SaaS product.

**Reply:** I don't understand what you are saying; this just installs the standard PostgreSQL client, it has nothing to do with settings. The scripts in this pull request already provide everything needed to reproduce the results.
@@ -0,0 +1,148 @@
#!/bin/bash

# set -e

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 <directory> <database_name> <table_name> <max_files> <success_log> <error_log>"
    exit 1
fi

# Arguments
DIRECTORY="$1"
DIRECTORY=$(realpath "$DIRECTORY")
DB_NAME="$2"
TABLE_NAME="$3"
MAX_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"
PSQL_CMD="$HOLOGRES_PSQL -d $DB_NAME"

FORCE_REPROCESS=0
SAVE_INTO_CACHE=1
CACHE_DIR=${DIRECTORY}/cleaned

# Validate that MAX_FILES is a number
if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then
    echo "Error: <max_files> must be a positive integer."
    exit 1
fi

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# Ensure the log files exist
touch "$SUCCESS_LOG" "$ERROR_LOG"
echo "SUCCESS_LOG $SUCCESS_LOG"
echo "ERROR_LOG $ERROR_LOG"

echo "---------------------------"
echo "FORCE_REPROCESS=$FORCE_REPROCESS"
echo "SAVE_INTO_CACHE=$SAVE_INTO_CACHE"
echo "CACHE_DIR=$CACHE_DIR"
echo "---------------------------"

# Create a temporary directory in /var/tmp and ensure it's accessible
TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX)
chmod 777 "$TEMP_DIR"        # Allow access for all users
trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit

# Counter to track processed files
counter=0

# Collect the .json.gz files in sorted order; report if there are none
shopt -s nullglob
files=("$DIRECTORY"/*.json.gz)
if [[ ${#files[@]} -eq 0 ]]; then
    echo "No .json.gz files found in the directory."
fi

# Loop through each .json.gz file in the directory
for file in "${files[@]}"; do

    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Processing $file ..."
    counter=$((counter + 1))

    filename=$(basename "$file" .gz)                  # e.g., data.json
    cleaned_basename="${filename%.json}_cleaned.json" # e.g., data_cleaned.json

    # Path of the cached cleaned file (its final location)
    cached_file="$CACHE_DIR/$cleaned_basename"

    # If the cached file already exists, skip reprocessing
    if [[ -f "$cached_file" && "$FORCE_REPROCESS" == 0 ]]; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Cached file exists: $cached_file - skipping processing."
        cleaned_file="$cached_file"
    else
        # Uncompress the file into the temporary directory
        uncompressed_file="$TEMP_DIR/$filename"
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] gunzip: $file ..."
        gunzip -c "$file" > "$uncompressed_file"

        # Check if uncompression was successful
        if [[ $? -ne 0 ]]; then
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." | tee -a "$ERROR_LOG"
            continue
        fi
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] gunzip done: $uncompressed_file"

        # Preprocess the file: remove null characters and merge JSON records that
        # span two lines into one line (raises the import success rate above 99%)
        cleaned_file="$TEMP_DIR/$(basename "${uncompressed_file%.json}_cleaned.json")"
        cleaned_file_realpath=$(realpath "$cleaned_file")
        sed 's/\\u0000//g' "$uncompressed_file" | awk 'NR == 1 { printf "%s", $0; next } /^{/ { printf "\n%s", $0; next } { printf "%s", $0 } END { print "" }' > "$cleaned_file"

        # Grant read permissions for the postgres user
        chmod 644 "$cleaned_file"

        if [[ "$SAVE_INTO_CACHE" != 0 ]]; then
            # Save the cleaned file into the cache directory
            mkdir -p "$CACHE_DIR"
            cp "$cleaned_file" "$cached_file"
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Saved cleaned file to cache: $(realpath "$cached_file")"
        fi
    fi

    wc -l "$cleaned_file"

    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Start importing $cleaned_file into Hologres." | tee -a "$SUCCESS_LOG"

    max_retries=3
    timeout_seconds=90
    attempt=1

    # Import the cleaned JSON file into Hologres, retrying on timeouts.
    # The unusual CSV quote/escape/delimiter bytes (\x01, \x02) never occur in the
    # JSON text, so each line passes through unchanged into the single JSONB column.
    until [ $attempt -gt $max_retries ]; do
        echo "($attempt) Try to copy data ..."
        timeout $timeout_seconds $PSQL_CMD -c "\COPY $TABLE_NAME FROM '$cleaned_file' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');"

        import_status=$?

        # timeout(1) exits with 124 when the command timed out; retry only in that case
        if [ $import_status -ne 124 ]; then
            break
        fi

        attempt=$((attempt + 1))
        sleep 1
    done

    # Check if the import was successful
    if [[ $import_status -eq 0 ]]; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $cleaned_file into Hologres." | tee -a "$SUCCESS_LOG"
        # Delete both the uncompressed and cleaned files after successful processing
        rm -f "$uncompressed_file" "$cleaned_file_realpath"
    else
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $cleaned_file. See errors above." | tee -a "$ERROR_LOG"
        # Keep the files for debugging purposes
    fi

    # Stop processing if the max number of files is reached
    if [[ $counter -ge $MAX_FILES ]]; then
        echo "Processed maximum number of files: $MAX_FILES"
        break
    fi
done

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
**Review comment:** Hey, extra commands should not be used, like VACUUM, ANALYZE, compact.

**Reply:** It is allowed, see https://github.com/ClickHouse/JSONBench/blob/main/postgresql/create_and_load.sh
Also, in ClickBench a lot of Postgres-based systems use commands like "vacuum" and "analyze".

**Reply:** By the way, hg_full_compact_table performs essentially the same function as VACUUM, with the added benefit of ensuring that all files are fully compacted and compressed using ZSTD. Without this step, some files might be compressed with ZSTD while others are not, which could lead to inconsistencies in performance stability and overall storage size. That said, if @rschu1ze strongly prefers that we remove it, we can do so; there is no significant impact on the results.