@@ -282,6 +282,13 @@ get_current_comp_nodes_for_scale_out_replica() {
282282# Note: During rebuild-instance, a new PVC is created without existing data and having the rebuild.flag file.
283283# Therefore, we must rejoin this instance to the cluster as a secondary node.
284284is_rebuild_instance () {
285+
286+ # Check if nodes.conf is corrupted
287+ if [[ -f /data/nodeconfcorrupted.flag ]]; then
288+ echo " Rebuild instance detected: nodes.conf is corrupted"
289+ return 0
290+ fi
291+
285292 # Early return if rebuild flag doesn't exist
286293 [[ ! -f /data/rebuild.flag ]] && return 1
287294
@@ -307,6 +314,13 @@ remove_rebuild_instance_flag() {
307314 fi
308315}
309316
# Remove the marker file that records "nodes.conf was found corrupted".
#
# is_rebuild_instance treats the presence of this marker as "this instance
# must be rejoined as a rebuild-instance", so a marker that silently survives
# makes every later start take the rebuild path. A failed removal is therefore
# reported explicitly instead of being logged as a success.
#
# Arguments:
#   $1 - (optional) path of the marker file;
#        defaults to /data/nodeconfcorrupted.flag
# Outputs:
#   stdout - confirmation message when the marker was removed
#   stderr - failure message when the marker could not be removed
# Returns:
#   0 always; removal is best-effort so the caller's flow is never aborted.
remove_nodeconfcorrupted_flag() {
  local flag_file="${1:-/data/nodeconfcorrupted.flag}"

  # Nothing to clean up.
  [ -f "$flag_file" ] || return 0

  if rm -f -- "$flag_file"; then
    echo " remove nodeconfcorrupted.flag file succeeded!"
  else
    echo " remove nodeconfcorrupted.flag file failed!" >&2
  fi
  return 0
}
323+
310324# scale out replica of redis cluster shard if needed
311325scale_redis_cluster_replica () {
312326 # Waiting for redis-server to start
@@ -388,13 +402,18 @@ scale_redis_cluster_replica() {
388402 # current_node_with_port do not use advertised svc and port, because advertised svc and port are not ready when Pod is not Ready.
389403 if is_rebuild_instance; then
390404 echo " Current instance is a rebuild-instance, forget node id in the cluster firstly."
391- node_id=$( get_cluster_id_with_retry " $primary_node_endpoint " " $primary_node_port " " $CURRENT_POD_IP " )
405+ # node_id=$(get_cluster_id_with_retry "$primary_node_endpoint" "$primary_node_port" "$CURRENT_POD_IP")
406+ if [ -f /data/nodeconfcorrupted.flag ]; then
407+ node_id=$( grep " $CURRENT_POD_IP " $( ls -t /data/nodes.conf.* .bak | head -1) | awk ' {print $1}' )
408+ else
409+ node_id=$( get_cluster_id_with_retry " $primary_node_endpoint " " $primary_node_port " " $CURRENT_POD_IP " )
410+ fi
392411 if [ -z ${REDIS_DEFAULT_PASSWORD} ]; then
393412 redis-cli -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id}
394413 else
395414 redis-cli -p $service_port --cluster call $primary_node_endpoint_with_port cluster forget ${node_id} -a ${REDIS_DEFAULT_PASSWORD}
396415 fi
397- fi
416+ fi
398417 current_node_with_port=" $CURRENT_POD_IP :$service_port "
399418 replicated_output=$( secondary_replicated_to_primary " $current_node_with_port " " $primary_node_endpoint_with_port " " $primary_node_cluster_id " )
400419 status=$?
@@ -420,6 +439,7 @@ scale_redis_cluster_replica() {
420439 if is_rebuild_instance; then
421440 echo " replicate the node $CURRENT_POD_IP to the primary node $primary_node_endpoint_with_port successfully in rebuild-instance, remove rebuild.flag file..."
422441 remove_rebuild_instance_flag
442+ remove_nodeconfcorrupted_flag
423443 fi
424444
425445 # Hacky: When the entire redis cluster is restarted, a hacky sleep is used to wait for all primaries to enter the restarting state
@@ -668,6 +688,31 @@ build_redis_conf() {
668688 build_redis_default_accounts
669689}
670690
# Detect a "corrupted cluster config file" error in the tail of the
# redis-server log and, if found, move nodes.conf aside and create a marker
# file so the rest of the script can treat this instance as a rebuild.
#
# The backup is named "<nodes_conf>.<YYYYmmddHHMMSS>.bak".
#
# Arguments (all optional; defaults are the paths used by this script):
#   $1 - log file to inspect            (default: /data/running.log)
#   $2 - cluster config file to back up (default: /data/nodes.conf)
#   $3 - marker file to create          (default: /data/nodeconfcorrupted.flag)
# Outputs:
#   stdout - two [WARN] lines when a backup is taken
#   stderr - an [ERROR] line when the backup could not be created
# Returns:
#   0 when there is nothing to do or the backup succeeded,
#   1 when nodes.conf could not be moved (no marker is created in that case).
#
# NOTE: uses `return` rather than `exit`, so the function is also safe to call
# in the foreground; when launched with `&` the effect is the same.
check_and_backup_nodes_conf() {
  local log_file="${1:-/data/running.log}"
  local nodes_conf="${2:-/data/nodes.conf}"
  local flag_file="${3:-/data/nodeconfcorrupted.flag}"
  local recent_log
  local backup_path

  # No log yet, so there is nothing to inspect.
  [ -f "$log_file" ] || return 0

  # Already backed up: the marker stays until the instance has rejoined the
  # cluster. Without this guard a stale error line still present in the log
  # tail would cause a second nodes.conf to be moved away as well.
  [ -f "$flag_file" ] && return 0

  # Nothing to back up.
  [ -f "$nodes_conf" ] || return 0

  # Only the last 10 log lines are considered.
  recent_log=$(tail -n 10 "$log_file")
  grep -qF "corrupted cluster config file" <<<"$recent_log" || return 0

  backup_path="${nodes_conf}.$(date +"%Y%m%d%H%M%S").bak"
  if ! mv -- "$nodes_conf" "$backup_path"; then
    echo " [ERROR] corrupted cluster config detected, but backup of nodes.conf failed" >&2
    return 1
  fi
  echo " [WARN] corrupted cluster config detected, backup nodes.conf"

  touch "$flag_file"
  echo " [WARN] corrupted cluster config detected, touch ${flag_file}"
  return 0
}
714+
715+
671716# This is magic for shellspec ut framework.
672717# Sometimes, functions are defined in a single shell script.
673718# You will want to test them, but you do not want to run the script.
@@ -681,4 +726,5 @@ parse_redis_cluster_shard_announce_addr
681726build_redis_conf
682727# TODO: move to memberJoin action in the future
683728scale_redis_cluster_replica &
729+ check_and_backup_nodes_conf &
684730start_redis_server
0 commit comments