Skip to content

Commit e488178

Browse files
authored
RO replicas recovery improvement (#146)
* RO replicas: a script that automates the whole recovery procedure per replica * min fixes
1 parent 7f5a466 commit e488178

File tree

1 file changed

+277
-51
lines changed

1 file changed

+277
-51
lines changed

content/en/altinity-kb-setup-and-maintenance/altinity-kb-check-replication-ddl-queue.md

Lines changed: 277 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -98,84 +98,310 @@ FORMAT TSVRaw;
9898
Sometimes, due to crashes, ZooKeeper unavailability, slowness, or other reasons, some tables can end up in read-only mode. This mode allows SELECTs but not INSERTs, so we need to run the DROP / RESTORE replica procedure.
9999

100100
Just to be clear, this procedure **will not delete any data**, it will just re-create the metadata in zookeeper with the current state of the [ClickHouse replica](/altinity-kb-setup-and-maintenance/altinity-kb-data-migration/add_remove_replica/).
101+
102+
How it works:
101103

102104
```sql
103105
ALTER TABLE table_name DROP DETACHED PARTITION ALL SETTINGS allow_drop_detached = 1; -- clean the detached folder before the operation (requires allow_drop_detached). PARTITION ALL works only in recent ClickHouse versions
104106
DETACH TABLE table_name; -- Required for DROP REPLICA
105-
-- Use the zookeeper_path and replica_name from the above query.
107+
-- Use the zookeeper_path and replica_name from system.replicas.
106108
SYSTEM DROP REPLICA 'replica_name' FROM ZKPATH '/table_path_in_zk'; -- It will remove everything from the /table_path_in_zk/replicas/replica_name
107109
ATTACH TABLE table_name; -- Table will be in readonly mode, because there is no metadata in ZK and after that execute
108110
SYSTEM RESTORE REPLICA table_name; -- It will detach all partitions, re-create metadata in ZK (like it's new empty table), and then attach all partitions back
109-
SYSTEM SYNC REPLICA table_name; -- Wait for replicas to synchronize parts. Also it's recommended to check `system.detached_parts` on all replicas after recovery is finished.
110-
SELECT name FROM system.detached_parts WHERE table = 'table_name'; -- check for leftovers. See the potential problem here - https://gist.github.com/den-crane/702e4c8a1162dae7c2edf48a7c2dd00d
111+
SYSTEM SYNC REPLICA table_name; -- Not mandatory. It will Wait for replicas to synchronize parts. Also it's recommended to check `system.detached_parts` on all replicas after recovery is finished.
112+
SELECT name FROM system.detached_parts WHERE table = 'table_name'; -- check for leftovers. See the potential problems here https://altinity.com/blog/understanding-detached-parts-in-clickhouse
111113
```
112114

113-
114115
Starting from version 23, it's possible to use syntax [SYSTEM DROP REPLICA \'replica_name\' FROM TABLE db.table](https://clickhouse.com/docs/en/sql-reference/statements/system#drop-replica) instead of the `ZKPATH` variant, but you need to execute the above command from a different replica than the one you want to drop, which is not convenient sometimes. We recommend using the above method because it works with any version and is more reliable.
115116

116-
## Procedure for many replicas generating DDL
117-
118-
```sql
119-
SELECT DISTINCT 'DETACH TABLE ' || database || '.' || table || ' ON CLUSTER \'data\';' FROM clusterAllReplicas('data',system.replicas) WHERE active_replicas < total_replicas FORMAT TSVRaw;
120-
121-
SELECT DISTINCT 'SYSTEM DROP REPLICA \'' || replica_name || '\' FROM ZKPATH \'' || zookeeper_path || '\';' FROM clusterAllReplicas('data',system.replicas) WHERE active_replicas < total_replicas FORMAT TSVRaw;
122-
123-
SELECT DISTINCT 'ATTACH TABLE ' || database || '.' || table || ' ON CLUSTER \'data\';' FROM clusterAllReplicas('data',system.replicas) WHERE active_replicas < total_replicas FORMAT TSVRaw;
117+
## Procedure to restore multiple tables in Read-Only mode per replica
124118

125-
SELECT DISTINCT 'SYSTEM RESTORE REPLICA ' || database || '.' || table || ' ON CLUSTER \'data\';' FROM clusterAllReplicas('data',system.replicas) WHERE active_replicas < total_replicas FORMAT TSVRaw;
119+
It is better to run the recovery one replica at a time, because restoring replicas with ON CLUSTER can lead to race conditions that cause errors and put heavy load on ZooKeeper/Keeper.
126120

127-
-- check detached parts afterwards
128-
SELECT * FROM clusterAllReplicas('data',system.detached_parts)
129121

130-
-- make clickhouse 'forget' about the table (data persisted on disk)
131-
DETACH TABLE db.table ON CLUSTER '...';
132-
133-
-- remove the zookeeper data about that table in zookeeper
134-
SYSTEM DROP REPLICA 'replica_name' FROM ZKPATH '/path/to/table/in/zk'; -- run the commands generated before.
122+
```sql
123+
-- Emits, for every read-only replicated table known to this server, one text
-- block made of a "-- Table N" marker followed by the four recovery statements
-- (DETACH, SYSTEM DROP REPLICA, ATTACH, SYSTEM RESTORE REPLICA).
SELECT
    concat(
        '-- Table ', toString(row_num), '\n',
        'DETACH TABLE `', database, '`.`', table, '`;\n',
        'SYSTEM DROP REPLICA ''', replica_name, ''' FROM ZKPATH ''', zookeeper_path, ''';\n',
        'ATTACH TABLE `', database, '`.`', table, '`;\n',
        'SYSTEM RESTORE REPLICA `', database, '`.`', table, '`;\n'
    )
FROM
(
    -- Number the tables in (database, table) order; numbering starts at 1.
    SELECT
        database,
        table,
        replica_name,
        zookeeper_path,
        rowNumberInAllBlocks() + 1 AS row_num
    FROM
    (
        -- One row per read-only table, with its replica name and ZooKeeper path.
        SELECT
            database,
            table,
            any(replica_name) AS replica_name,
            any(zookeeper_path) AS zookeeper_path
        FROM system.replicas
        WHERE is_readonly
        GROUP BY
            database,
            table
        ORDER BY
            database,
            table
    )
    ORDER BY
        database,
        table
)
FORMAT TSVRaw;
147+
```
135148

136-
-- register table in clickhouse again - it will be in readonly mode.
137-
ATTACH TABLE db.table ON CLUSTER '...';
149+
This will generate the DDL statements to be executed on each replica, producing output that can be saved as an SQL file. It is important to execute the commands on each replica in the sequence generated by the query above:
138150

139-
-- recreate the zookeeper data from the
140-
SYSTEM RESTORE REPLICA db.name ON CLUSTER '...';
151+
- DETACH the table
152+
- DROP REPLICA
153+
- ATTACH the table
154+
- RESTORE REPLICA
141155

142-
--- do restart replica
156+
If we do this in parallel, a table could still be attaching while another query is dropping or restoring the replica in ZooKeeper, causing errors.
143157

144-
SELECT DISTINCT 'clickhouse-client --host=' || left(hostName(),-2) || ' --query=\'SYSTEM RESTART REPLICA '||database || '.' || table|| '\'' FROM clusterAllReplicas('all-sharded', system.replication_queue) WHERE last_exception != '' and create_time > now() -130 FORMAT TSVRaw;
145-
```
146-
147-
Here a bash script that will do the same as above but tailored to a single replica, you can call it like `bash restore_replica.sh chi-clickhouse-cluster-main-cluster-1-3`:
158+
The following bash script will read the generated SQL file and execute the commands sequentially, asking for user input in case of errors. Simply save the generated SQL to a file (e.g. `recovery_commands.sql`) and run the script below (that you can name as `clickhouse_replica_recovery.sh`):
148159

149160
```bash
150-
#!/usr/bin/env bash
161+
$ clickhouse_replica_recovery.sh recovery_commands.sql
162+
```
151163

152-
#Call like bash restore_replica.sh chi-clickhouse-cluster-main-cluster-1-3
153164

154-
set -o errexit # exit on fail
155-
set -o pipefail # catch errors in pipelines
156-
set -o nounset # exit on undeclared variable
157-
set -o xtrace # trace execution
165+
Here is the script:
158166

159-
restore_replica() {
160-
local chi_name=$1
161-
# assumes `chi-...-cluster-<shard>-<replica>` naming ou can change this patter to your needs
162-
local shard=$(echo $chi_name |grep -oP '(?<=cluster-)\d+(?=-\d+$)')
167+
```bash
168+
#!/bin/bash
169+
170+
# ClickHouse Replica Recovery Script
171+
# This script executes DETACH, DROP REPLICA, ATTACH, and RESTORE REPLICA commands sequentially
172+
173+
# Configuration
174+
# Connection settings; each one can be overridden through the environment.
CLICKHOUSE_HOST="${CLICKHOUSE_HOST:-localhost}"
CLICKHOUSE_PORT="${CLICKHOUSE_PORT:-9000}"
CLICKHOUSE_USER="${CLICKHOUSE_USER:-clickhouse_operator}"
# Use "-" rather than ":-" so that an explicitly empty CLICKHOUSE_PASSWORD is
# honored (no --password option is passed) instead of being replaced by the
# placeholder. The placeholder is still used when the variable is unset.
CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD-xxxxxxxxx}"
# First positional argument: path of the SQL file to replay.
COMMANDS_FILE="${1:-recovery_commands.sql}"
# One log file per run, named after the start time.
LOG_FILE="recovery_$(date +%Y%m%d_%H%M%S).log"

# Colors for output (ANSI escape sequences, interpreted when printed with echo -e)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
NC='\033[0m' # No Color
188+
189+
# Function to log messages
190+
# Print a timestamped message to stdout and append the same line to $LOG_FILE.
# Backslash escapes in the message (such as the color codes) are interpreted.
#   $1 - message text
log() {
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%b\n' "[${stamp}] $1" | tee -a "$LOG_FILE"
}
163193

194+
# Function to execute a SQL statement with retry logic
195+
# Execute one SQL statement through clickhouse-client, prompting the operator
# on failure.
#
# Arguments:
#   $1 - SQL statement text (sent to clickhouse-client on stdin)
#   $2 - table number, used only in log messages
#   $3 - step name (e.g. DETACH), used only in log messages
# Returns:
#   0 - the statement succeeded
#   1 - the statement failed and the operator chose to ignore the error
#   2 - the statement failed and the operator chose to skip the whole table
# Exits the script with status 1 when the operator chooses Abort, or when no
# answer can be read from the terminal.
execute_sql() {
    local sql="$1"
    local table_num="$2"
    local step_name="$3"
    local output exit_code response

    # Build the client invocation as an array so that a host, user or password
    # containing spaces or glob characters is passed as one literal argument
    # (an unquoted string expansion would split and glob-expand it).
    local -a ch_cmd=(
        clickhouse-client
        "--host=$CLICKHOUSE_HOST"
        "--port=$CLICKHOUSE_PORT"
        "--user=$CLICKHOUSE_USER"
    )
    if [ -n "$CLICKHOUSE_PASSWORD" ]; then
        ch_cmd+=("--password=$CLICKHOUSE_PASSWORD")
    fi

    # Outer loop: one iteration per execution attempt (re-entered on Retry).
    while true; do
        log "${YELLOW}Executing command for Table $table_num - $step_name:${NC}"
        log "$sql"

        # Execute the command and capture output (stdout + stderr) and exit code
        output=$(printf '%s\n' "$sql" | "${ch_cmd[@]}" 2>&1)
        exit_code=$?

        # Log the output
        printf '%s\n' "$output" | tee -a "$LOG_FILE"

        if [ "$exit_code" -eq 0 ]; then
            log "${GREEN}✓ Successfully executed${NC}"
            return 0
        fi

        log "${RED}✗ Failed to execute (Exit code: $exit_code)${NC}"
        log "${RED}Error output: $output${NC}"

        # Inner loop: keep asking until the operator gives a valid answer.
        while true; do
            echo ""
            log "${MAGENTA}========================================${NC}"
            log "${MAGENTA}Error occurred! Choose an option:${NC}"
            log "${MAGENTA}========================================${NC}"
            echo -e "${YELLOW}[R]${NC} - Retry this command"
            echo -e "${YELLOW}[I]${NC} - Ignore this error and continue to next command in this table"
            echo -e "${YELLOW}[S]${NC} - Skip this entire table and move to next table"
            echo -e "${YELLOW}[A]${NC} - Abort script execution"
            echo ""
            echo -n "Enter your choice (R/I/S/A): "

            # Read from /dev/tty because stdin of the caller's loop is the
            # commands file. If the terminal is unavailable or closed, abort
            # instead of re-prompting forever.
            if ! read -r response < /dev/tty; then
                echo ""
                log "${RED}Could not read a choice from the terminal. Aborting script execution...${NC}"
                exit 1
            fi

            case "${response^^}" in
                R|RETRY)
                    log "${BLUE}Retrying command...${NC}"
                    break # Leave the prompt loop; the outer loop retries
                    ;;
                I|IGNORE)
                    log "${YELLOW}Ignoring error and continuing to next command...${NC}"
                    return 1 # Return error but continue
                    ;;
                S|SKIP)
                    log "${YELLOW}Skipping entire table $table_num...${NC}"
                    return 2 # Return special code to skip table
                    ;;
                A|ABORT)
                    log "${RED}Aborting script execution...${NC}"
                    exit 1
                    ;;
                *)
                    echo -e "${RED}Invalid option '$response'. Please enter R, I, S, or A.${NC}"
                    ;;
            esac
        done
    done
}
178268

179-
restore_replica "$@"
269+
# Main execution function
270+
# Main entry point: replays the statements in $COMMANDS_FILE one at a time.
#
# Input format, as parsed below:
#   - a line matching "-- Table <number>" starts a new table section;
#   - any other line starting with "--" and all blank lines are ignored;
#   - remaining lines are accumulated until one ends with ";" (optionally
#     followed by whitespace), and the accumulated text is then executed
#     through execute_sql.
#
# Exit status: 0 only when no command failed, was ignored or was skipped,
# and the file did not end in the middle of a statement; 1 otherwise
# (including a missing commands file).
main() {
    log "${BLUE}========================================${NC}"
    log "${BLUE}ClickHouse Replica Recovery Script${NC}"
    log "${BLUE}========================================${NC}"
    log "Host: $CLICKHOUSE_HOST:$CLICKHOUSE_PORT"
    log "User: $CLICKHOUSE_USER"
    log "Commands file: $COMMANDS_FILE"
    log "Log file: $LOG_FILE"
    echo ""

    # Check if commands file exists
    if [ ! -f "$COMMANDS_FILE" ]; then
        log "${RED}Error: Commands file '$COMMANDS_FILE' not found!${NC}"
        echo ""
        echo "Usage: $0 [commands_file]"
        echo "  commands_file: Path to SQL commands file (default: recovery_commands.sql)"
        echo ""
        echo "Example: $0 my_commands.sql"
        exit 1
    fi

    # Process SQL commands from file
    local current_sql=""
    local table_counter=0
    local step_in_table=0
    local failed_count=0
    local success_count=0
    local ignored_count=0
    local skipped_tables=()
    local skip_current_table=false
    local line step_name result

    # "|| [ -n "$line" ]" keeps a final line that has no trailing newline.
    while IFS= read -r line || [ -n "$line" ]; do
        # Skip empty and whitespace-only lines
        if [[ "$line" =~ ^[[:space:]]*$ ]]; then
            continue
        fi

        # Check if this is a comment line indicating a new table
        if [[ "$line" =~ ^[[:space:]]*--[[:space:]]*Table[[:space:]]+([0-9]+) ]]; then
            table_counter="${BASH_REMATCH[1]}"
            step_in_table=0
            skip_current_table=false
            log ""
            log "${BLUE}========================================${NC}"
            log "${BLUE}Processing Table $table_counter${NC}"
            log "${BLUE}========================================${NC}"
            continue
        elif [[ "$line" =~ ^[[:space:]]*-- ]]; then
            # Skip other comment lines
            continue
        fi

        # While a table is being skipped, only count its statements
        if [ "$skip_current_table" = true ]; then
            if [[ "$line" =~ \;[[:space:]]*$ ]]; then
                step_in_table=$((step_in_table + 1))
            fi
            continue
        fi

        # Accumulate the SQL statement
        current_sql+="$line "

        # Keep accumulating until the statement is complete (ends with semicolon)
        if ! [[ "$line" =~ \;[[:space:]]*$ ]]; then
            continue
        fi

        step_in_table=$((step_in_table + 1))

        # Determine the step name (left empty for unrecognized statements)
        step_name=""
        if [[ "$current_sql" =~ ^[[:space:]]*DETACH ]]; then
            step_name="DETACH"
        elif [[ "$current_sql" =~ ^[[:space:]]*SYSTEM[[:space:]]+DROP[[:space:]]+REPLICA ]]; then
            step_name="DROP REPLICA"
        elif [[ "$current_sql" =~ ^[[:space:]]*ATTACH ]]; then
            step_name="ATTACH"
        elif [[ "$current_sql" =~ ^[[:space:]]*SYSTEM[[:space:]]+RESTORE[[:space:]]+REPLICA ]]; then
            step_name="RESTORE REPLICA"
        fi

        log ""
        log "Step $step_in_table/4: $step_name"

        # Execute the statement; execute_sql returns 0 (ok), 1 (ignored) or 2 (skip table)
        result=0
        execute_sql "$current_sql" "$table_counter" "$step_name" || result=$?

        case "$result" in
            0)
                success_count=$((success_count + 1))
                sleep 1 # Small delay between commands
                ;;
            1)
                # User chose to ignore this error
                failed_count=$((failed_count + 1))
                ignored_count=$((ignored_count + 1))
                sleep 1
                ;;
            2)
                # User chose to skip this table. The command that triggered the
                # skip did fail, so count it: otherwise a run with skipped tables
                # would be reported as fully successful and exit with status 0.
                failed_count=$((failed_count + 1))
                skip_current_table=true
                skipped_tables+=("$table_counter")
                log "${YELLOW}Skipping remaining commands for Table $table_counter${NC}"
                ;;
        esac

        # Reset current_sql for next statement
        current_sql=""
    done < "$COMMANDS_FILE"

    # A statement without a terminating semicolon at end of file was never
    # executed; report it instead of dropping it silently.
    if [[ ! "$current_sql" =~ ^[[:space:]]*$ ]]; then
        failed_count=$((failed_count + 1))
        log "${RED}Unterminated statement at end of file was not executed: $current_sql${NC}"
    fi

    # Summary
    log ""
    log "${BLUE}========================================${NC}"
    log "${BLUE}Execution Summary${NC}"
    log "${BLUE}========================================${NC}"
    log "Total successful commands: ${GREEN}$success_count${NC}"
    log "Total failed commands: ${RED}$failed_count${NC}"
    log "Total ignored errors: ${YELLOW}$ignored_count${NC}"
    log "Total tables processed: $table_counter"

    if [ ${#skipped_tables[@]} -gt 0 ]; then
        log "Skipped tables: ${YELLOW}${skipped_tables[*]}${NC}"
    fi

    log "Log file: $LOG_FILE"

    if [ "$failed_count" -eq 0 ]; then
        log "${GREEN}All commands executed successfully!${NC}"
        exit 0
    else
        log "${YELLOW}Some commands failed or were ignored. Please check the log file.${NC}"
        exit 1
    fi
}
402+
403+
# Run the main function
404+
main
405+
180406
```
181407

0 commit comments

Comments
 (0)