Commit 41d45c8

add Alibaba Cloud Hologres
1 parent a1389f4 commit 41d45c8

19 files changed (+613, −0)

hologres/README.md

Lines changed: 23 additions & 0 deletions
Hologres is an all-in-one real-time data warehouse engine that is compatible with PostgreSQL. It supports online analytical processing (OLAP) and ad hoc analysis of PB-scale data. Hologres supports online data serving at high concurrency and low latency.

To evaluate the performance of Hologres, follow these guidelines to set up and execute the benchmark tests.

1. **Instance Purchase**:
   Refer to the [Alibaba Cloud Hologres TPC-H Testing Documentation](https://www.alibabacloud.com/help/en/hologres/user-guide/test-plan?spm=a2c63.p38356.help-menu-113622.d_2_14_0_0.54e14f70oTAEXO) for details on purchasing Hologres and ECS instances. Both instances must be purchased in the same region and the same zone.

2. **Benchmark Execution**:
   Once the instances are set up, prepare the following parameters:
   - `user`: user name for Hologres; users can be created in the Hologres web console
   - `password`: password for Hologres; set when creating the user
   - `host_name`: hostname of the Hologres instance, shown in the Alibaba Cloud Console; select the VPC endpoint for the best performance
   - `port`: port of the Hologres instance (usually `80`)

   Then set the environment variables (a quick connectivity check is sketched after the sample execution below):
   ```
   export PG_USER={user};export PG_PASSWORD={password};export PG_HOSTNAME={host_name};export PG_PORT={port}
   ```

3. **Sample Execution**:
   ```bash
   ./main.sh 5 /root/bluesky
   ```
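Before launching a full run, it can help to verify that these variables actually reach the instance. A minimal sketch, assuming the `psql` client from `install.sh` and that the default `postgres` database is reachable (not part of the benchmark scripts):

```bash
# Hypothetical connectivity check; assumes the psql client is installed
# and that the instance exposes the default "postgres" database.
PGPASSWORD="$PG_PASSWORD" psql -h "$PG_HOSTNAME" -p "$PG_PORT" -U "$PG_USER" -d postgres -c "SELECT version();"
```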

hologres/benchmark.sh

Lines changed: 37 additions & 0 deletions
```bash
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <DB_NAME> [RESULT_FILE]"
    exit 1
fi

# Arguments
DB_NAME="$1"
RESULT_FILE="${2:-}"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# Construct the query log file name using $DB_NAME.
# OUTPUT_PREFIX is expected to be set in the caller's environment.
QUERY_LOG_FILE="${OUTPUT_PREFIX}_${DB_NAME}.query_log"

# Print the database name
echo "Running queries on database: $DB_NAME"

# Run queries and log the output
./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE"

# Process the query log: extract each "Time: ... ms" value, convert it to
# seconds, and fold every three timings into one bracketed row
RESULT=$(grep -oP 'Time: \d+\.\d+ ms' "$QUERY_LOG_FILE" | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | \
    awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }')

# Output the result
if [[ -n "$RESULT_FILE" ]]; then
    echo "$RESULT" > "$RESULT_FILE"
    echo "Result written to $RESULT_FILE"
else
    echo "$RESULT"
fi

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
```

hologres/count.sh

Lines changed: 18 additions & 0 deletions
```bash
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# Count the rows in the table ($HOLOGRES_PSQL is set in the caller's environment)
$HOLOGRES_PSQL -d "$DB_NAME" -t -c "SELECT count(*) FROM $TABLE_NAME"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
```

hologres/create_and_load.sh

Lines changed: 42 additions & 0 deletions
```bash
#!/bin/bash

# set -e

# Check if the required arguments are provided
if [[ $# -lt 7 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DDL_FILE> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"
DDL_FILE="$3"
DATA_DIRECTORY="$4"
NUM_FILES="$5"
SUCCESS_LOG="$6"
ERROR_LOG="$7"

# Validate arguments
[[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; }
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

echo "Drop and create database"
$HOLOGRES_PSQL -c "DROP DATABASE IF EXISTS $DB_NAME" -c "CREATE DATABASE $DB_NAME"
echo "Disable result cache."
$HOLOGRES_PSQL -c "ALTER DATABASE $DB_NAME SET hg_experimental_enable_result_cache TO off;"

echo "Execute DDL"
$HOLOGRES_PSQL -d "$DB_NAME" -t < "$DDL_FILE"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] Vacuum, analyze, and compact the table"
$HOLOGRES_PSQL -d "$DB_NAME" -c '\timing' -c "VACUUM $TABLE_NAME"
$HOLOGRES_PSQL -d "$DB_NAME" -c '\timing' -c "ANALYZE $TABLE_NAME"
$HOLOGRES_PSQL -d "$DB_NAME" -c '\timing' -c "select hologres.hg_full_compact_table('$TABLE_NAME')"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
```

hologres/ddl.sql

Lines changed: 7 additions & 0 deletions
```sql
-- Single-column table: each Bluesky event is stored as one JSONB document
CREATE TABLE bluesky (
    data JSONB NOT NULL
);

-- Store the JSONB column in Hologres' columnar format, and let Hologres
-- pick dictionary encoding and bitmap indexes automatically per column
ALTER TABLE bluesky ALTER COLUMN data SET (enable_columnar_type = ON);
CALL set_table_property('bluesky', 'dictionary_encoding_columns', 'data:auto');
CALL set_table_property('bluesky', 'bitmap_columns', 'data:auto');
```
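With `enable_columnar_type = ON`, fields extracted from the JSONB documents are stored column-wise, so filters and aggregations over individual keys avoid scanning whole documents. A hypothetical smoke-test query (the `kind` key and database name are assumptions about the Bluesky data, not part of this commit):

```bash
# Hypothetical: aggregate over one JSON key to confirm columnar access works.
$HOLOGRES_PSQL -d bluesky_5 -c "SELECT data->>'kind' AS kind, count(*) FROM bluesky GROUP BY kind ORDER BY count(*) DESC;"
```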

hologres/drop_tables.sh

Lines changed: 17 additions & 0 deletions
```bash
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <DB_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# echo "Dropping database"
$HOLOGRES_PSQL -c "DROP DATABASE $DB_NAME"

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
```

hologres/index_usage.sh

Lines changed: 31 additions & 0 deletions
```bash
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <EXPLAIN_CMD>"
    exit 1
fi

# Arguments
DB_NAME="$1"
EXPLAIN_CMD="$2"

QUERY_NUM=1

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# queries.sql is expected to hold one query per line
while read -r query; do

    # Print the query number
    echo "------------------------------------------------------------------------------------------------------------------------"
    echo "Index usage for query Q$QUERY_NUM:"
    echo

    $HOLOGRES_PSQL -d "$DB_NAME" -t -c "$EXPLAIN_CMD $query"

    # Increment the query number
    QUERY_NUM=$((QUERY_NUM + 1))

done < queries.sql

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
```

hologres/install.sh

Lines changed: 6 additions & 0 deletions
```bash
#!/bin/bash

# Install PostgreSQL 16 for the psql client used to talk to Hologres;
# see https://www.postgresql.org/download/linux/ubuntu/

sudo apt-get update
sudo apt-get install -y postgresql-common postgresql-16
```
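A quick way to confirm the client landed on the PATH after installation:

```bash
psql --version   # expected to report psql (PostgreSQL) 16.x
```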

hologres/load_data.sh

Lines changed: 148 additions & 0 deletions
```bash
#!/bin/bash

# set -e

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 <directory> <database_name> <table_name> <max_files> <success_log> <error_log>"
    exit 1
fi

# Arguments
DIRECTORY=$(realpath "$1")
DB_NAME="$2"
TABLE_NAME="$3"
MAX_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"
PSQL_CMD="$HOLOGRES_PSQL -d $DB_NAME"

FORCE_REPROCESS=0
SAVE_INTO_CACHE=1
CACHE_DIR="${DIRECTORY}/cleaned"

# Validate that MAX_FILES is a number
if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then
    echo "Error: <max_files> must be a positive integer."
    exit 1
fi

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") START"

# Ensure the log files exist
touch "$SUCCESS_LOG" "$ERROR_LOG"
echo "SUCCESS_LOG $SUCCESS_LOG"
echo "ERROR_LOG $ERROR_LOG"

echo "---------------------------"
echo "FORCE_REPROCESS=$FORCE_REPROCESS"
echo "SAVE_INTO_CACHE=$SAVE_INTO_CACHE"
echo "CACHE_DIR=$CACHE_DIR"
echo "---------------------------"

# Create a temporary directory in /var/tmp and ensure it's accessible
TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX)
chmod 777 "$TEMP_DIR"          # Allow access for all users
trap "rm -rf $TEMP_DIR" EXIT   # Ensure cleanup on script exit

# Counter to track processed files
counter=0

# Loop through each .json.gz file in the directory (glob expansion is sorted)
for file in "$DIRECTORY"/*.json.gz; do
    if [[ -f "$file" ]]; then

        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Processing $file ..."
        counter=$((counter + 1))

        filename=$(basename "$file" .gz)                    # e.g., data.json
        cleaned_basename="${filename%.json}_cleaned.json"   # e.g., data_cleaned.json

        # Final location of the cached cleaned file
        # (-m tolerates the cache directory not existing yet)
        cached_file=$(realpath -m "$CACHE_DIR/$cleaned_basename")

        # Skip the cleaning step if a cached cleaned file already exists
        if [[ -f "$cached_file" && "$FORCE_REPROCESS" == 0 ]]; then
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Cached file exists: $cached_file - skipping processing."
            cleaned_file="$cached_file"
        else
            # Uncompress the file into the temporary directory
            uncompressed_file="$TEMP_DIR/$filename"
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] gunzip: $file ..."
            gunzip -c "$file" > "$uncompressed_file"

            # Check if uncompression was successful
            if [[ $? -ne 0 ]]; then
                echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." | tee -a "$ERROR_LOG"
                continue
            fi
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] gunzip done: $uncompressed_file"

            # Preprocess the file: remove null characters, then merge JSON records
            # that span two lines back into a single line (this raises the import
            # success rate above 99%)
            cleaned_file="$TEMP_DIR/$(basename "${uncompressed_file%.json}_cleaned.json")"
            cleaned_file_realpath=$(realpath -m "$cleaned_file")
            sed 's/\\u0000//g' "$uncompressed_file" | awk 'NR == 1 { printf "%s", $0; next } /^{/ { printf "\n%s", $0; next } { printf "%s", $0 } END { print "" }' > "$cleaned_file"

            # Grant read permissions for the postgres user
            chmod 644 "$cleaned_file"

            if [[ "$SAVE_INTO_CACHE" != 0 ]]; then
                # Save the cleaned file into the cache directory
                mkdir -p "$CACHE_DIR"
                cp "$cleaned_file" "$cached_file"
                echo "[$(date '+%Y-%m-%d %H:%M:%S')] Saved cleaned file to cache: $cached_file"
            fi
        fi

        wc -l "$cleaned_file"

        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Start importing $cleaned_file into Hologres." | tee -a "$SUCCESS_LOG"

        max_retries=3
        timeout_seconds=90
        attempt=1

        # Import the cleaned JSON file into Hologres. CSV format with the control
        # characters \x01 (quote/escape) and \x02 (delimiter), which never occur
        # in the data, lets each line be loaded verbatim as a single JSONB value.
        # Retry when \COPY is killed by the timeout (exit code 124).
        until [ $attempt -gt $max_retries ]; do
            echo "($attempt) Try to copy data ..."
            timeout $timeout_seconds $PSQL_CMD -c "\COPY $TABLE_NAME FROM '$cleaned_file' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');"

            import_status=$?

            if [ $import_status -ne 124 ]; then
                break
            fi

            attempt=$((attempt + 1))
            sleep 1
        done

        # Check if the import was successful
        if [[ $import_status -eq 0 ]]; then
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $cleaned_file into Hologres." | tee -a "$SUCCESS_LOG"
            # Delete both the uncompressed and cleaned files after successful processing
            rm -f "$uncompressed_file" "$cleaned_file_realpath"
        else
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $cleaned_file. See errors above." | tee -a "$ERROR_LOG"
            # Keep the files for debugging purposes
        fi

        # Stop processing if the max number of files is reached
        if [[ $counter -ge $MAX_FILES ]]; then
            echo "Processed maximum number of files: $MAX_FILES"
            break
        fi
    else
        echo "No .json.gz files found in the directory."
    fi
done

echo "[$(date '+%Y-%m-%d %H:%M:%S')] $(basename "$0") DONE"
```
