Skip to content

Commit 9874435

Browse files
gnolong authored and apecloud-bot committed
fix: mongodb restore and pre-terminate hang (#2335)
(cherry picked from commit 01f044f)
1 parent e12f629 commit 9874435

File tree

6 files changed

+116
-59
lines changed

6 files changed

+116
-59
lines changed

addons/mongodb/dataprotection/common-scripts.sh

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -366,3 +366,66 @@ function process_restore_end_signal() {
366366
done
367367
echo "INFO: Prepare restore end signal completed."
368368
}
369+
370+
#######################################
# Fetch the PBM metadata of "$backup_name" into the global "describe_result".
#
# Runs `pbm describe-backup ... -o json` (stdout and stderr merged) until it
# succeeds with non-empty output. While the output contains "not found" the
# call is retried; on every attempt where attempt % 30 == 29 the PBM config is
# re-synced from storage through sync_pbm_config_from_storage. Any other
# failure terminates the script immediately.
#
# Globals:
#   PBM_MONGODB_URI  (read)    connection string handed to pbm
#   backup_name      (read)    name of the backup to describe
#   describe_result  (written) raw output of the last pbm call
# Arguments:
#   $1 - optional, maximum number of attempts (default 360)
#   $2 - optional, seconds to sleep between attempts (default 2)
# Outputs:
#   INFO/ERROR progress lines on stdout.
# Returns:
#   0 on success; calls `exit 1` on a non-retryable error or when all
#   attempts are used up.
#######################################
function get_describe_backup_info() {
  local max_retries=${1:-360}
  local retry_interval=${2:-2}
  local attempt=1
  local rc=0
  local succeeded=false
  local restore_errexit=false

  # Remember whether the caller runs with errexit so that exactly that state
  # is restored afterwards instead of unconditionally switching it on.
  case $- in
    *e*) restore_errexit=true ;;
  esac

  describe_result=""
  # errexit must be off: a failing pbm call is an expected, handled outcome.
  set +e
  while [ "$attempt" -le "$max_retries" ]; do
    describe_result=$(pbm describe-backup --mongodb-uri "$PBM_MONGODB_URI" "$backup_name" -o json 2>&1)
    rc=$?
    if [ "$rc" -eq 0 ] && [ -n "$describe_result" ]; then
      succeeded=true
      break
    fi

    # Only "not found" is retryable; everything else is fatal.
    if ! echo "$describe_result" | grep -q "not found"; then
      echo "ERROR: Failed to get backup metadata: $describe_result"
      exit 1
    fi

    echo "INFO: Attempt $attempt: backup $backup_name not found, retrying in ${retry_interval}s..."
    if [ $((attempt % 30)) -eq 29 ]; then
      echo "INFO: Sync PBM config from storage again."
      sync_pbm_config_from_storage
    fi
    sleep "$retry_interval"
    attempt=$((attempt + 1))
  done
  if [ "$restore_errexit" = true ]; then
    set -e
  fi

  # Decide on the explicit success flag rather than re-grepping the payload:
  # valid metadata may legitimately contain the words "not found".
  if [ "$succeeded" != true ]; then
    echo "ERROR: Failed to get backup metadata after $max_retries attempts"
    exit 1
  fi
}
401+
402+
#######################################
# Block until the PBM restore named "$restore_name" has finished.
#
# Writes a temporary PBM storage config to ${MOUNT_DIR}/tmp/pbm_restore.cnf,
# then polls `pbm describe-restore "$restore_name" -c <config> -o json` and
# reads `.status` with jq:
#   "done"                        -> remove the config file and return
#   "", "starting", "running"     -> sleep 5 seconds and poll again
#   anything else                 -> remove the config file and exit 1
#
# Globals (all read):
#   MOUNT_DIR, restore_name,
#   S3_REGION, S3_BUCKET, S3_PREFIX, S3_ENDPOINT,
#   S3_FORCE_PATH_STYLE (defaults to "false"), S3_ACCESS_KEY, S3_SECRET_KEY
# Outputs:
#   INFO/ERROR progress lines on stdout.
# Returns:
#   0 once the restore is done; calls `exit 1` when no restore name is
#   available or the restore reports an unexpected status.
#######################################
function wait_for_restoring() {
  local cnf_file="${MOUNT_DIR}/tmp/pbm_restore.cnf"
  local restore_status=""

  # Callers obtain the name via `pbm restore ... -o json | jq -r '.name'`,
  # which yields "" or "null" when the restore could not be started. Polling
  # such a name would never leave the empty-status branch, so fail fast.
  if [ -z "${restore_name:-}" ] || [ "$restore_name" = "null" ]; then
    echo "ERROR: Restore name is empty, cannot wait for restore."
    exit 1
  fi

  mkdir -p "${MOUNT_DIR}/tmp"
  # The file holds S3 credentials: create it owner-only. The umask change is
  # confined to a subshell so the caller's umask is not affected.
  # NOTE(review): YAML nesting below follows the PBM storage config layout
  # (storage.s3.credentials) — confirm against the original file.
  (
    umask 077
    cat > "$cnf_file" <<EOF
storage:
  type: s3
  s3:
    region: ${S3_REGION}
    bucket: ${S3_BUCKET}
    prefix: ${S3_PREFIX}
    endpointUrl: ${S3_ENDPOINT}
    forcePathStyle: ${S3_FORCE_PATH_STYLE:-false}
    credentials:
      access-key-id: ${S3_ACCESS_KEY}
      secret-access-key: ${S3_SECRET_KEY}
EOF
  )
  # Also tighten the mode when the file already existed before this call.
  chmod 600 "$cnf_file"

  while true; do
    restore_status=$(pbm describe-restore "$restore_name" -c "$cnf_file" -o json | jq -r '.status')
    echo "INFO: Restore $restore_name status: $restore_status"
    case "$restore_status" in
      "done")
        rm -f -- "$cnf_file"
        break
        ;;
      "" | "starting" | "running")
        # An empty status is treated like "still in progress".
        sleep 5
        ;;
      *)
        rm -f -- "$cnf_file"
        echo "ERROR: Restore $restore_name ended with unexpected status: $restore_status"
        exit 1
        ;;
    esac
  done
}

addons/mongodb/dataprotection/rs-pbm-full-restore.sh

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,7 @@ if [ -z "$backup_type" ] || [ -z "$backup_name" ]; then
3737
exit 1
3838
fi
3939

40-
MAX_RETRIES=360
41-
RETRY_INTERVAL=2
42-
attempt=1
43-
describe_result=""
44-
set +e
45-
while [ $attempt -le $MAX_RETRIES ]; do
46-
describe_result=$(pbm describe-backup --mongodb-uri "$PBM_MONGODB_URI" "$backup_name" -o json 2>&1)
47-
if [ $? -eq 0 ] && [ -n "$describe_result" ]; then
48-
break
49-
elif echo "$describe_result" | grep -q "not found"; then
50-
echo "INFO: Attempt $attempt: Failed to get backup metadata, retrying in ${RETRY_INTERVAL}s..."
51-
sleep $RETRY_INTERVAL
52-
((attempt++))
53-
continue
54-
else
55-
echo "ERROR: Failed to get backup metadata: $describe_result"
56-
exit 1
57-
fi
58-
done
59-
set -e
60-
61-
if [ -z "$describe_result" ] || echo "$describe_result" | grep -q "not found"; then
62-
echo "ERROR: Failed to get backup metadata after $MAX_RETRIES attempts"
63-
exit 1
64-
fi
40+
get_describe_backup_info
6541

6642
rs_name=$(echo "$describe_result" | jq -r '.replsets[0].name')
6743
mappings="$MONGODB_REPLICA_SET_NAME=$rs_name"
@@ -71,6 +47,8 @@ process_restore_start_signal
7147

7248
wait_for_other_operations
7349

74-
pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
50+
restore_name=$(pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
51+
52+
wait_for_restoring
7553

7654
process_restore_end_signal

addons/mongodb/dataprotection/rs-pbm-pitr-restore.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ sync_pbm_storage_config
2828

2929
sync_pbm_config_from_storage
3030

31-
3231
process_restore_start_signal
3332

3433
extras=$(cat /dp_downward/status_extras)
@@ -43,7 +42,9 @@ echo "INFO: Starting restore..."
4342

4443
wait_for_other_operations
4544

46-
pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
45+
restore_name=$(pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
46+
47+
wait_for_restoring
4748

4849
process_restore_end_signal
4950

addons/mongodb/dataprotection/shard-pbm-full-restore.sh

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,7 @@ if [ -z "$backup_type" ] || [ -z "$backup_name" ]; then
3737
exit 1
3838
fi
3939

40-
MAX_RETRIES=360
41-
RETRY_INTERVAL=2
42-
attempt=1
43-
describe_result=""
44-
set +e
45-
while [ $attempt -le $MAX_RETRIES ]; do
46-
describe_result=$(pbm describe-backup --mongodb-uri "$PBM_MONGODB_URI" "$backup_name" -o json 2>&1)
47-
if [ $? -eq 0 ] && [ -n "$describe_result" ]; then
48-
break
49-
elif echo "$describe_result" | grep -q "not found"; then
50-
echo "INFO: Attempt $attempt: Failed to get backup metadata, retrying in ${RETRY_INTERVAL}s..."
51-
sleep $RETRY_INTERVAL
52-
((attempt++))
53-
continue
54-
else
55-
echo "ERROR: Failed to get backup metadata: $describe_result"
56-
exit 1
57-
fi
58-
done
59-
set -e
60-
61-
if [ -z "$describe_result" ] || echo "$describe_result" | grep -q "not found"; then
62-
echo "ERROR: Failed to get backup metadata after $MAX_RETRIES attempts"
63-
exit 1
64-
fi
40+
get_describe_backup_info
6541

6642
configsvr_name=$(echo "$describe_result" | jq -r '.replsets[] | select(.configsvr == true) | .name')
6743
echo "INFO: Config server replica set name: $configsvr_name"
@@ -104,6 +80,8 @@ process_restore_start_signal
10480

10581
wait_for_other_operations
10682

107-
pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
83+
restore_name=$(pbm restore $backup_name --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
84+
85+
wait_for_restoring
10886

10987
process_restore_end_signal

addons/mongodb/dataprotection/shard-pbm-pitr-restore.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,9 @@ echo "INFO: Starting restore..."
7676

7777
wait_for_other_operations
7878

79-
pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" --wait
79+
restore_name=$(pbm restore --time="$recovery_target_time" --mongodb-uri "$PBM_MONGODB_URI" --replset-remapping "$mappings" -o json | jq -r '.name')
80+
81+
wait_for_restoring
8082

8183
process_restore_end_signal
8284

addons/mongodb/scripts/mongodb-shard-manage.sh

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@ wait_for_mongos() {
3131
check_shard_exists() {
3232
# check if the shard exists in the config database
3333
local shard_exists
34-
shard_exists=$($CLUSTER_MONGO "db.getSiblingDB(\"config\").shards.find({ _id: \"$MONGODB_REPLICA_SET_NAME\" })")
34+
shard_exists=$($CLUSTER_MONGO "db.getSiblingDB(\"config\").shards.find({ _id: \"$MONGODB_REPLICA_SET_NAME\" })" 2>/dev/null)
35+
if [ $? -ne 0 ]; then
36+
echo "ERROR: Failed to check if shard $MONGODB_REPLICA_SET_NAME exists." >&2
37+
exit 1
38+
fi
39+
echo "INFO: Check if shard $MONGODB_REPLICA_SET_NAME exists: $shard_exists"
3540
if [ -n "$shard_exists" ]; then
3641
return 0 # true
3742
else
@@ -63,7 +68,11 @@ initialize_or_scale_out_mongodb_shard() {
6368
# Issue the removeShard admin command for $MONGODB_REPLICA_SET_NAME through
# $CLUSTER_MONGO and print the stringified reply on stdout.
# The reply is serialised with EJSON.stringify when $CLIENT is "mongosh" and
# with JSON.stringify for any other client.
get_remove_shard_status() {
  # Pick the serialiser that matches the shell client in use.
  local stringify="JSON.stringify"
  if [ "$CLIENT" = "mongosh" ]; then
    stringify="EJSON.stringify"
  fi

  # $CLUSTER_MONGO is intentionally unquoted: it is a command line that must
  # be word-split into the program and its arguments.
  local status_json
  status_json=$($CLUSTER_MONGO "${stringify}(db.adminCommand( { removeShard: \"$MONGODB_REPLICA_SET_NAME\" } ))")
  echo "$status_json"
}
6978

@@ -76,6 +85,32 @@ get_remove_shard_state() {
7685
echo "$state"
7786
}
7887

88+
# Print the counter stored at .remaining.<field> of a removeShard status JSON.
#
# Arguments:
#   $1 - status JSON text
#   $2 - field name below .remaining (e.g. "chunks")
# Outputs:
#   The counter as a plain integer; 0 when the field (or .remaining) is
#   missing. Nothing is printed when jq cannot parse the input.
#
# The value shape is detected from the data itself rather than from $CLIENT,
# so a plain number, a numeric string, and a wrapper object are all accepted.
# NOTE(review): which wrapper key the legacy shell's JSON.stringify emits for
# a 64-bit integer ("numberLong", "$numberLong" or "floatApprox") was not
# verifiable from here, so all three are tried — confirm and trim if desired.
_get_remaining_count() {
  local status_json=$1
  local field=$2
  printf '%s\n' "$status_json" | jq -r --arg field "$field" '
    .remaining[$field]
    | if type == "object" then
        (.numberLong // .["$numberLong"] // .floatApprox // 0)
      else
        (. // 0)
      end
    | tonumber
  '
}

# Print the number of jumbo chunks still remaining on the shard being removed.
# Arguments: $1 - removeShard status JSON text
# Outputs:   integer on stdout (0 when the field is absent)
get_remaining_jumbo_chunks() {
  _get_remaining_count "$1" "jumboChunks"
}

# Print the number of chunks still remaining on the shard being removed.
# Arguments: $1 - removeShard status JSON text
# Outputs:   integer on stdout (0 when the field is absent)
get_remaining_chunks() {
  _get_remaining_count "$1" "chunks"
}
113+
79114
delete_or_scale_in_mongodb_shard() {
80115
# Check if the shard is scaling in
81116
if [[ $KB_CLUSTER_COMPONENT_IS_SCALING_IN != "true" ]]; then
@@ -111,13 +146,13 @@ delete_or_scale_in_mongodb_shard() {
111146
if [ "$state" = "completed" ]; then
112147
break
113148
elif [ "$state" = "ongoing" ]; then
114-
remaining_jumboChunks=$(echo "$status_json" | jq -r '.remaining.jumboChunks')
149+
remaining_jumboChunks=$(get_remaining_jumbo_chunks "$status_json")
115150
if [ "$remaining_jumboChunks" -gt 0 ]; then
116151
echo "INFO: $remaining_jumboChunks jumbo chunks remaining, please clear jumbo chunks before removing the shard."
117152
exit 1
118153
fi
119154

120-
remaining_chunks=$(echo "$status_json" | jq -r '.remaining.chunks')
155+
remaining_chunks=$(get_remaining_chunks "$status_json")
121156
echo "INFO: $remaining_chunks chunks remaining."
122157
if [ "$remaining_chunks" -eq 0 ]; then
123158
dbs_to_move=$(echo "$status_json" | jq -r '.dbsToMove[]')

0 commit comments

Comments
 (0)