Skip to content

Commit cd13374

Browse files
gnolong authored and apecloud-bot committed
fix: mongodb restore and pre-terminate hang (#2335)
(cherry picked from commit 01f044f)
1 parent 292aa71 commit cd13374

File tree

6 files changed

+116
-59
lines changed

6 files changed

+116
-59
lines changed

addons/mongodb/dataprotection/common-scripts.sh

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,3 +352,66 @@ function process_restore_end_signal() {
352352
done
353353
echo "INFO: Prepare restore end signal completed."
354354
}
355+
356+
#######################################
# Fetch the metadata of backup "$backup_name" with `pbm describe-backup`,
# retrying while PBM answers that the backup is "not found".
# Globals:
#   PBM_MONGODB_URI  (read)    connection string handed to pbm
#   backup_name      (read)    name of the backup to describe
#   describe_result  (written) combined stdout/stderr of the last attempt;
#                              deliberately global, callers parse it with jq
# Behaviour:
#   - up to 360 attempts, 2 seconds apart;
#   - whenever attempt % 30 == 29 the PBM config is re-synced from storage
#     via sync_pbm_config_from_storage;
#   - errexit is switched off while polling and `set -e` is switched on
#     again before returning (this is a side effect on the caller's shell).
# Exits:
#   1 on any error other than "not found", or when all attempts are used up.
#######################################
function get_describe_backup_info() {
  # Keep the loop bookkeeping local so it cannot clobber caller variables.
  local max_retries=360
  local retry_interval=2
  local attempt=1
  local rc=0
  local succeeded=0

  describe_result=""
  set +e
  while [ "$attempt" -le "$max_retries" ]; do
    describe_result=$(pbm describe-backup --mongodb-uri "$PBM_MONGODB_URI" "$backup_name" -o json 2>&1)
    rc=$?
    if [ "$rc" -eq 0 ] && [ -n "$describe_result" ]; then
      succeeded=1
      break
    elif echo "$describe_result" | grep -q "not found"; then
      echo "INFO: Attempt $attempt: backup $backup_name not found, retrying in ${retry_interval}s..."
      if [ $((attempt % 30)) -eq 29 ]; then
        echo "INFO: Sync PBM config from storage again."
        sync_pbm_config_from_storage
      fi
      sleep "$retry_interval"
      attempt=$((attempt + 1))
      continue
    else
      echo "ERROR: Failed to get backup metadata: $describe_result"
      exit 1
    fi
  done
  set -e

  # Decide on the recorded outcome instead of grepping the payload again:
  # a successful answer may legitimately contain the words "not found".
  if [ "$succeeded" -ne 1 ]; then
    echo "ERROR: Failed to get backup metadata after $max_retries attempts"
    exit 1
  fi
}
387+
388+
#######################################
# Poll `pbm describe-restore` until restore "$restore_name" finishes.
# A temporary PBM storage config is written to
# ${MOUNT_DIR}/tmp/pbm_restore.cnf and passed to pbm with -c; it is removed
# again once the restore is done or has failed.
# Globals (read):
#   restore_name, MOUNT_DIR,
#   S3_REGION, S3_BUCKET, S3_PREFIX, S3_ENDPOINT, S3_FORCE_PATH_STYLE
#   (defaults to "false"), S3_ACCESS_KEY, S3_SECRET_KEY
# Status handling:
#   "done"                        -> stop waiting
#   "", "starting", "running"     -> sleep 5 seconds and poll again
#   anything else                 -> exit 1
# Exits:
#   1 when no restore name is available, when the config file cannot be
#   written, or when the restore ends in any other status.
#######################################
function wait_for_restoring() {
  local cnf_file="${MOUNT_DIR}/tmp/pbm_restore.cnf"
  local restore_status

  # Without a restore name every describe call yields an empty status, which
  # is treated as "not visible yet" and would be polled forever.
  if [ -z "${restore_name:-}" ] || [ "$restore_name" = "null" ]; then
    echo "ERROR: Restore name is empty, the restore was not started." >&2
    exit 1
  fi

  # The file carries the S3 secret key: create it readable by the owner only.
  (
    umask 077
    cat <<EOF > "$cnf_file" || exit 1
storage:
  type: s3
  s3:
    region: ${S3_REGION}
    bucket: ${S3_BUCKET}
    prefix: ${S3_PREFIX}
    endpointUrl: ${S3_ENDPOINT}
    forcePathStyle: ${S3_FORCE_PATH_STYLE:-false}
    credentials:
      access-key-id: ${S3_ACCESS_KEY}
      secret-access-key: ${S3_SECRET_KEY}
EOF
    # Tighten the mode as well in case the file already existed.
    chmod 600 "$cnf_file"
  ) || {
    echo "ERROR: Failed to write PBM restore config $cnf_file" >&2
    exit 1
  }

  while true; do
    restore_status=$(pbm describe-restore "$restore_name" -c "$cnf_file" -o json | jq -r '.status')
    echo "INFO: Restore $restore_name status: $restore_status"
    case "$restore_status" in
      "done")
        rm -f -- "$cnf_file"
        break
        ;;
      "" | "starting" | "running")
        sleep 5
        ;;
      *)
        rm -f -- "$cnf_file"
        echo "ERROR: Restore $restore_name ended with status: $restore_status" >&2
        exit 1
        ;;
    esac
  done
}

addons/mongodb/dataprotection/rs-pbm-full-restore.sh

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,7 @@ if [ -z "$backup_type" ] || [ -z "$backup_name" ]; then
3737
exit 1
3838
fi
3939

40-
MAX_RETRIES=360
41-
RETRY_INTERVAL=2
42-
attempt=1
43-
describe_result=""
44-
set +e
45-
while [ $attempt -le $MAX_RETRIES ]; do
46-
describe_result=$(pbm describe-backup --mongodb-uri "$PBM_MONGODB_URI" "$backup_name" -o json 2>&1)
47-
if [ $? -eq 0 ] && [ -n "$describe_result" ]; then
48-
break
49-
elif echo "$describe_result" | grep -q "not found"; then
50-
echo "INFO: Attempt $attempt: Failed to get backup metadata, retrying in ${RETRY_INTERVAL}s..."
51-
sleep $RETRY_INTERVAL
52-
((attempt++))
53-
continue
54-
else
55-
echo "ERROR: Failed to get backup metadata: $describe_result"
56-
exit 1
57-
fi
58-
done
59-
set -e
60-
61-
if [ -z "$describe_result" ] || echo "$describe_result" | grep -q "not found"; then
62-
echo "ERROR: Failed to get backup metadata after $MAX_RETRIES attempts"
63-
exit 1
64-
fi
40+
get_describe_backup_info
6541

6642
rs_name=$(echo "$describe_result" | jq -r '.replsets[0].name')
6743
mappings="$MONGODB_REPLICA_SET_NAME=$rs_name"
@@ -71,6 +47,8 @@ process_restore_start_signal
7147

7248
wait_for_other_operations
7349

74-
pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
50+
restore_name=$(pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
51+
52+
wait_for_restoring
7553

7654
process_restore_end_signal

addons/mongodb/dataprotection/rs-pbm-pitr-restore.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ sync_pbm_storage_config
2828

2929
sync_pbm_config_from_storage
3030

31-
3231
process_restore_start_signal
3332

3433
extras=$(cat /dp_downward/status_extras)
@@ -43,7 +42,9 @@ echo "INFO: Starting restore..."
4342

4443
wait_for_other_operations
4544

46-
pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
45+
restore_name=$(pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
46+
47+
wait_for_restoring
4748

4849
process_restore_end_signal
4950

addons/mongodb/dataprotection/shard-pbm-full-restore.sh

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,7 @@ if [ -z "$backup_type" ] || [ -z "$backup_name" ]; then
3737
exit 1
3838
fi
3939

40-
MAX_RETRIES=360
41-
RETRY_INTERVAL=2
42-
attempt=1
43-
describe_result=""
44-
set +e
45-
while [ $attempt -le $MAX_RETRIES ]; do
46-
describe_result=$(pbm describe-backup --mongodb-uri "$PBM_MONGODB_URI" "$backup_name" -o json 2>&1)
47-
if [ $? -eq 0 ] && [ -n "$describe_result" ]; then
48-
break
49-
elif echo "$describe_result" | grep -q "not found"; then
50-
echo "INFO: Attempt $attempt: Failed to get backup metadata, retrying in ${RETRY_INTERVAL}s..."
51-
sleep $RETRY_INTERVAL
52-
((attempt++))
53-
continue
54-
else
55-
echo "ERROR: Failed to get backup metadata: $describe_result"
56-
exit 1
57-
fi
58-
done
59-
set -e
60-
61-
if [ -z "$describe_result" ] || echo "$describe_result" | grep -q "not found"; then
62-
echo "ERROR: Failed to get backup metadata after $MAX_RETRIES attempts"
63-
exit 1
64-
fi
40+
get_describe_backup_info
6541

6642
configsvr_name=$(echo "$describe_result" | jq -r '.replsets[] | select(.configsvr == true) | .name')
6743
echo "INFO: Config server replica set name: $configsvr_name"
@@ -103,6 +79,8 @@ process_restore_start_signal
10379

10480
wait_for_other_operations
10581

106-
pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
82+
restore_name=$(pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
83+
84+
wait_for_restoring
10785

10886
process_restore_end_signal

addons/mongodb/dataprotection/shard-pbm-pitr-restore.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,9 @@ echo "INFO: Starting restore..."
7575

7676
wait_for_other_operations
7777

78-
pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
78+
restore_name=$(pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
79+
80+
wait_for_restoring
7981

8082
process_restore_end_signal
8183

addons/mongodb/scripts/mongodb-shard-manage.sh

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@ wait_for_mongos() {
3131
check_shard_exists() {
3232
# check if the shard exists in the config database
3333
local shard_exists
34-
shard_exists=$($CLUSTER_MONGO "db.getSiblingDB(\"config\").shards.find({ _id: \"$MONGODB_REPLICA_SET_NAME\" })")
34+
shard_exists=$($CLUSTER_MONGO "db.getSiblingDB(\"config\").shards.find({ _id: \"$MONGODB_REPLICA_SET_NAME\" })" 2>/dev/null)
35+
if [ $? -ne 0 ]; then
36+
echo "ERROR: Failed to check if shard $MONGODB_REPLICA_SET_NAME exists." >&2
37+
exit 1
38+
fi
39+
echo "INFO: Check if shard $MONGODB_REPLICA_SET_NAME exists: $shard_exists"
3540
if [ -n "$shard_exists" ]; then
3641
return 0 # true
3742
else
@@ -63,7 +68,11 @@ initialize_or_scale_out_mongodb_shard() {
6368
get_remove_shard_status() {
6469
# Execute the removeShard command and capture its JSON output
6570
local result
66-
result=$($CLUSTER_MONGO "EJSON.stringify(db.adminCommand( { removeShard: \"$MONGODB_REPLICA_SET_NAME\" } ))")
71+
if [ "$CLIENT" = "mongosh" ]; then
72+
result=$($CLUSTER_MONGO "EJSON.stringify(db.adminCommand( { removeShard: \"$MONGODB_REPLICA_SET_NAME\" } ))")
73+
else
74+
result=$($CLUSTER_MONGO "JSON.stringify(db.adminCommand( { removeShard: \"$MONGODB_REPLICA_SET_NAME\" } ))")
75+
fi
6776
echo "$result"
6877
}
6978

@@ -76,6 +85,32 @@ get_remove_shard_state() {
7685
echo "$state"
7786
}
7887

88+
# Print the jumbo-chunk count found in a removeShard status document.
# Arguments: $1 - the status document as a JSON string
# Globals:   CLIENT (read) - "mongosh" reads .remaining.jumboChunks directly,
#            any other value reads .remaining.jumboChunks.numberLong
# Outputs:   the count on stdout; 0 when the selected field is missing/null
get_remaining_jumbo_chunks() {
  local result=$1
  local jq_filter jumbo_chunks

  # Pick the jq path matching the JSON shape produced by the client in use.
  case "$CLIENT" in
    mongosh) jq_filter='.remaining.jumboChunks // 0' ;;
    *) jq_filter='.remaining.jumboChunks.numberLong // 0' ;;
  esac

  jumbo_chunks=$(echo "$result" | jq -r "$jq_filter")
  echo "$jumbo_chunks"
}
100+
101+
# Print the remaining-chunk count found in a removeShard status document.
# Arguments: $1 - the status document as a JSON string
# Globals:   CLIENT (read) - "mongosh" reads .remaining.chunks directly,
#            any other value reads .remaining.chunks.numberLong
# Outputs:   the count on stdout; 0 when the selected field is missing/null
get_remaining_chunks() {
  local result=$1
  local jq_filter chunks

  # Pick the jq path matching the JSON shape produced by the client in use.
  case "$CLIENT" in
    mongosh) jq_filter='.remaining.chunks // 0' ;;
    *) jq_filter='.remaining.chunks.numberLong // 0' ;;
  esac

  chunks=$(echo "$result" | jq -r "$jq_filter")
  echo "$chunks"
}
113+
79114
delete_or_scale_in_mongodb_shard() {
80115
# Check if the shard is scaling in
81116
if [[ $KB_CLUSTER_COMPONENT_IS_SCALING_IN != "true" ]]; then
@@ -111,13 +146,13 @@ delete_or_scale_in_mongodb_shard() {
111146
if [ "$state" = "completed" ]; then
112147
break
113148
elif [ "$state" = "ongoing" ]; then
114-
remaining_jumboChunks=$(echo "$status_json" | jq -r '.remaining.jumboChunks')
149+
remaining_jumboChunks=$(get_remaining_jumbo_chunks "$status_json")
115150
if [ "$remaining_jumboChunks" -gt 0 ]; then
116151
echo "INFO: $remaining_jumboChunks jumbo chunks remaining, please clear jumbo chunks before removing the shard."
117152
exit 1
118153
fi
119154

120-
remaining_chunks=$(echo "$status_json" | jq -r '.remaining.chunks')
155+
remaining_chunks=$(get_remaining_chunks "$status_json")
121156
echo "INFO: $remaining_chunks chunks remaining."
122157
if [ "$remaining_chunks" -eq 0 ]; then
123158
dbs_to_move=$(echo "$status_json" | jq -r '.dbsToMove[]')

0 commit comments

Comments
 (0)