Skip to content

Commit 1a1faf9

Browse files
DISTMYSQL-208: Orchestrator GUI incorrectly shows recovery option for
intermediate database in chained replication https://jira.percona.com/browse/DISTMYSQL-208 Problem: Given a replication chain A->B->C where C is down, the GUI shows the 'Recover' dropdown for node B, even though no recovery action is possible in that case. The problem is clearly visible on upstream and Percona Orchestrator up to version v3.2.6-3. In later versions it is still present, but is hard to reproduce. Cause: The problem is hard to reproduce on versions > v3.2.6-6 because of the fix for DISTMYSQL-182. When the instance is unreachable, that fix causes the instance.Key assignment to be skipped, although the key is needed later to update Orchestrator's backend. As a result, the 'last_checked' and 'last_seen' timestamps were updated incorrectly. The root cause of the main problem is the analysis logic in analysis_dao.go:GetReplicationAnalysis(). The condition for setting AllIntermediateMasterReplicasNotReplicating does not check whether any replicas are reachable. So the case where all replicas are dead (no recovery action possible) and the case where some replicas are still reachable but not replicating (recovery action possible) are indistinguishable. Solution: 1. Move the instance.Key assignment before the Ping. 2. Improve the analysis logic: report AllIntermediateMasterReplicasNotReplicating only if none of the replicas is replicating, but some replicas are still reachable.
1 parent 76a7e20 commit 1a1faf9

File tree

10 files changed

+28
-7
lines changed

10 files changed

+28
-7
lines changed

go/inst/analysis_dao.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -608,9 +608,9 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
608608
a.Analysis = AllIntermediateMasterReplicasFailingToConnectOrDead
609609
a.Description = "Intermediate master is reachable but all of its replicas are failing to connect"
610610
//
611-
} else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
611+
} else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
612612
a.Analysis = AllIntermediateMasterReplicasNotReplicating
613-
a.Description = "Intermediate master is reachable but none of its replicas is replicating"
613+
a.Description = "Intermediate master is reachable but none of its reachable replicas is replicating"
614614
//
615615
} else if a.IsBinlogServer && a.IsFailingToConnectToMaster {
616616
a.Analysis = BinlogServerFailingToConnectToMaster

go/inst/instance_dao.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -384,14 +384,17 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, bufferWrites bool,
384384
latency.Stop("instance")
385385
goto Cleanup
386386
}
387+
388+
// Even if the instance is dead, we need its key below to update
389+
// the backend database's timestamps
390+
instance.Key = *instanceKey
391+
387392
err = db.Ping()
388393
if err != nil {
389394
goto Cleanup
390395
}
391396
latency.Stop("instance")
392397

393-
instance.Key = *instanceKey
394-
395398
if isMaxScale, resolvedHostname, err = instance.checkMaxScale(db, latency); err != nil {
396399
// We do not "goto Cleanup" here, although it should be the correct flow.
397400
// Reason is 5.7's new security feature that requires GRANTs on performance_schema.session_variables.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
127.0.0.1:10113<127.0.0.1:10112
2+
127.0.0.1:10114<127.0.0.1:10113
3+
127.0.0.1:10111 |0s|ok
4+
+ 127.0.0.1:10112 |0s|ok
5+
+ 127.0.0.1:10113 |0s|ok
6+
+ 127.0.0.1:10114|0s|ok
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# 10111 -> 10112 -> 10113 -> 10114
2+
orchestrator-client -c relocate -i 127.0.0.1:10113 -d 127.0.0.1:10112
3+
orchestrator-client -c relocate -i 127.0.0.1:10114 -d 127.0.0.1:10113
4+
5+
orchestrator-client -c topology-tabulated -alias ci | cut -d'|' -f 1,2,3

tests/system/intermediate-master-replica-failure/02-down-slave-3/expect_output

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
orchestrator-client -c replication-analysis
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
2+
3+
set -e
4+
5+
mysqladmin -uci -pci -h 127.0.0.1 --port=10114 shutdown
6+
sleep 20

tests/system/intermediate-master-replica-failure/skip_run

Whitespace-only changes.

tests/system/intermediate-master-replica-failure/teardown_redeploy

Whitespace-only changes.

tests/system/test.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ test_single() {
193193
fi
194194

195195
# test steps:
196-
find "$tests_path/$test_name" ! -path . -type d -mindepth 1 -maxdepth 1 | sort | cut -d "/" -f 5 | while read test_step_name ; do
196+
find "$tests_path/$test_name" -mindepth 1 -maxdepth 1 ! -path . -type d | sort | cut -d "/" -f 5 | while read test_step_name ; do
197197
[ "$test_step_name" == "." ] && continue
198198
test_step "$tests_path/$test_name/$test_step_name" "$test_name" "$test_step_name"
199199
if [ $? -ne 0 ] ; then
@@ -307,7 +307,7 @@ test_all() {
307307
while [ -s $tests_todo_file ] ; do
308308
echo -n > $tests_todo_file
309309

310-
find $tests_path ! -path . -type d -mindepth 1 -maxdepth 1 | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do
310+
find $tests_path -mindepth 1 -maxdepth 1 ! -path . -type d | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do
311311
if ! test_listed_as_attempted "$test_name" ; then
312312
echo "$test_name" >> $tests_todo_file
313313
fi
@@ -329,7 +329,7 @@ test_all() {
329329
fi
330330
done || return 1
331331
done
332-
find $tests_path ! -path . -type d -mindepth 1 -maxdepth 1 | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do
332+
find $tests_path -mindepth 1 -maxdepth 1 ! -path . -type d | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do
333333
if ! test_listed_as_attempted "$test_name" ; then
334334
echo "# ERROR: tests completed by $test_name seems to have been skipped"
335335
exit 1

0 commit comments

Comments
 (0)