Skip to content

Commit 66bea0c

Browse files
authored
Implement handling for corrupted nodes.conf in redis-cluster6-server-start.sh
Added functions to handle corrupted nodes.conf and backup.
1 parent d2d8151 commit 66bea0c

File tree

1 file changed

+48
-2
lines changed

1 file changed

+48
-2
lines changed

addons/redis/redis-cluster-scripts/redis-cluster6-server-start.sh

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,13 @@ get_current_comp_nodes_for_scale_out_replica() {
282282
# Note: During rebuild-instance, a new PVC is created without existing data and having the rebuild.flag file.
283283
# Therefore, we must rejoin this instance to the cluster as a secondary node.
284284
is_rebuild_instance() {
285+
286+
# Check if nodes.conf is corrupted
287+
if [[ -f /data/nodeconfcorrupted.flag ]]; then
288+
echo "Rebuild instance detected: nodes.conf is corrupted"
289+
return 0
290+
fi
291+
285292
# Early return if rebuild flag doesn't exist
286293
[[ ! -f /data/rebuild.flag ]] && return 1
287294

@@ -307,6 +314,13 @@ remove_rebuild_instance_flag() {
307314
fi
308315
}
309316

317+
# Delete the /data/nodeconfcorrupted.flag marker if it is present and log
# the removal. Does nothing (and prints nothing) when the marker is absent.
remove_nodeconfcorrupted_flag() {
  local marker=/data/nodeconfcorrupted.flag

  # Nothing to clean up when the marker file is not there.
  [ -f "$marker" ] || return 0

  rm -f "$marker"
  echo "remove nodeconfcorrupted.flag file succeeded!"
}
323+
310324
# scale out replica of redis cluster shard if needed
311325
scale_redis_cluster_replica() {
312326
# Waiting for redis-server to start
@@ -388,13 +402,18 @@ scale_redis_cluster_replica() {
388402
# current_node_with_port does not use the advertised svc and port, because the advertised svc and port are not ready while the Pod is not Ready.
389403
if is_rebuild_instance; then
390404
echo "Current instance is a rebuild-instance, forget node id in the cluster firstly."
391-
node_id=$(get_cluster_id_with_retry "$primary_node_endpoint" "$primary_node_port" "$CURRENT_POD_IP")
405+
#node_id=$(get_cluster_id_with_retry "$primary_node_endpoint" "$primary_node_port" "$CURRENT_POD_IP")
406+
if [ -f /data/nodeconfcorrupted.flag ]; then
407+
node_id=$(grep "$CURRENT_POD_IP" $(ls -t /data/nodes.conf.*.bak | head -1) | awk '{print $1}')
408+
else
409+
node_id=$(get_cluster_id_with_retry "$primary_node_endpoint" "$primary_node_port" "$CURRENT_POD_IP")
410+
fi
392411
if [ -z ${REDIS_DEFAULT_PASSWORD} ]; then
393412
redis-cli -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id}
394413
else
395414
redis-cli -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id} -a ${REDIS_DEFAULT_PASSWORD}
396415
fi
397-
fi
416+
fi
398417
current_node_with_port="$CURRENT_POD_IP:$service_port"
399418
replicated_output=$(secondary_replicated_to_primary "$current_node_with_port" "$primary_node_endpoint_with_port" "$primary_node_cluster_id")
400419
status=$?
@@ -420,6 +439,7 @@ scale_redis_cluster_replica() {
420439
if is_rebuild_instance; then
421440
echo "replicate the node $CURRENT_POD_IP to the primary node $primary_node_endpoint_with_port successfully in rebuild-instance, remove rebuild.flag file..."
422441
remove_rebuild_instance_flag
442+
remove_nodeconfcorrupted_flag
423443
fi
424444

425445
# Hacky: When the entire redis cluster is restarted, a hacky sleep is used to wait for all primaries to enter the restarting state
@@ -668,6 +688,31 @@ build_redis_conf() {
668688
build_redis_default_accounts
669689
}
670690

691+
# Detect a corrupted cluster config reported by the previous redis-server run
# and move the bad nodes.conf out of the way.
#
# The last 10 lines of running.log are searched for the message
# "corrupted cluster config file". When it is found and nodes.conf exists,
# nodes.conf is renamed to nodes.conf.<YYYYmmddHHMMSS>.bak and the
# nodeconfcorrupted.flag marker is created in the same directory.
#
# Arguments: $1 - data directory (optional, defaults to /data)
# Outputs:   [WARN] progress lines on stdout, [ERROR] lines on stderr
# Returns:   0 when nothing had to be done or the backup succeeded,
#            1 when the backup or the marker could not be created
check_and_backup_nodes_conf() {
  local data_dir="${1:-/data}"
  local log_file="${data_dir}/running.log"
  local nodes_conf="${data_dir}/nodes.conf"
  local flag_file="${data_dir}/nodeconfcorrupted.flag"
  local recent_log
  local backup_file

  # Without a log or a nodes.conf there is nothing to inspect or back up.
  [ -f "$log_file" ] || return 0
  [ -f "$nodes_conf" ] || return 0

  # Match with `case` instead of a tail|grep pipeline so the check behaves the
  # same whether or not errexit/pipefail are enabled by the caller.
  recent_log=$(tail -n 10 "$log_file") || return 0
  case "$recent_log" in
    *"corrupted cluster config file"*) ;;
    *) return 0 ;;
  esac

  backup_file="${nodes_conf}.$(date +"%Y%m%d%H%M%S").bak"

  # Never overwrite a backup that was already taken within this same second.
  if [ -e "$backup_file" ]; then
    return 0
  fi

  # Only raise the flag once the corrupted file has really been moved away;
  # otherwise the marker would be set while the corrupted file is still in place.
  if ! mv "$nodes_conf" "$backup_file"; then
    echo "[ERROR] failed to backup ${nodes_conf} to ${backup_file}" >&2
    return 1
  fi
  echo "[WARN] corrupted cluster config detected, backup nodes.conf"

  if ! touch "$flag_file"; then
    echo "[ERROR] failed to touch ${flag_file}" >&2
    return 1
  fi
  echo "[WARN] corrupted cluster config detected, touch ${flag_file}"

  # `return` rather than `exit`: identical when run as a background job, and
  # it does not terminate the caller when the function is invoked directly.
  return 0
}
714+
715+
671716
# This is magic for shellspec ut framework.
672717
# Sometimes, functions are defined in a single shell script.
673718
# You will want to test them, but you do not want to run the script.
@@ -681,4 +726,5 @@ parse_redis_cluster_shard_announce_addr
681726
build_redis_conf
682727
# TODO: move to memberJoin action in the future
683728
scale_redis_cluster_replica &
729+
check_and_backup_nodes_conf &
684730
start_redis_server

0 commit comments

Comments
 (0)