From 747fa26d34013427e2864225a03889ef4c843ddc Mon Sep 17 00:00:00 2001 From: Oleksandr Fedorov Date: Mon, 1 Jul 2024 15:47:24 -0400 Subject: [PATCH 1/6] Handle failures in GracefulMasterTakeover 1. Revert `read_only` for master in case of an error. 2. Introduce `PostUnsuccessfulGracefulTakeoverProcesses`. --- go/config/config.go | 2 ++ go/logic/topology_recovery.go | 56 +++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/go/config/config.go b/go/config/config.go index 9795383f..2c3ea7c2 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -242,6 +242,7 @@ type Configuration struct { PostMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses PostGracefulTakeoverProcesses []string // Processes to execute after runnign a graceful master takeover. Uses same placeholders as PostFailoverProcesses + PostUnsuccessfulGracefulTakeoverProcesses []string // Processes to execute after an unsuccessful graceful master takeover. Uses same placeholders as PostFailoverProcesses PostTakeMasterProcesses []string // Processes to execute after a successful Take-Master event has taken place RecoverNonWriteableMaster bool // When 'true', orchestrator treats a read-only master as a failure scenario and attempts to make the master writeable CoMasterRecoveryMustPromoteOtherCoMaster bool // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-master or else fail @@ -425,6 +426,7 @@ func newConfiguration() *Configuration { PostFailoverProcesses: []string{}, PostUnsuccessfulFailoverProcesses: []string{}, PostGracefulTakeoverProcesses: []string{}, + PostUnsuccessfulGracefulTakeoverProcesses: []string{}, PostTakeMasterProcesses: []string{}, RecoverNonWriteableMaster: false, CoMasterRecoveryMustPromoteOtherCoMaster: true, diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 7939ac8a..b190d76b 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -2172,29 +2172,21 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, return nil, nil, err } demotedMasterSelfBinlogCoordinates := &clusterMaster.SelfBinlogCoordinates - log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates) - if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { - return nil, nil, err - } - promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates - log.Infof("GracefulMasterTakeover: attempting recovery") - recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) + topologyRecovery, promotedMasterCoordinates, err = gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates, designatedInstance, analysisEntry) + if err != nil { - log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) - } - if !recoveryAttempted { - return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") - } - if topologyRecovery == nil { - return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen") - } - if topologyRecovery.SuccessorKey == nil { - // Promotion fails. - // Undo setting read-only on original master. + log.Errorf("GracefulMasterTakeover: promotion failed. Will set %+v as read_write", clusterMaster.Key) inst.SetReadOnly(&clusterMaster.Key, false) - return nil, nil, fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) + + if topologyRecovery == nil { + // If we failed to run the recovery, use the info we already have + topologyRecovery = preGracefulTakeoverTopologyRecovery + } + executeProcesses(config.Config.PostUnsuccessfulGracefulTakeoverProcesses, "PostUnsuccessfulGracefulTakeoverProcesses", topologyRecovery, false) + return nil, nil, err } + var gtidHint inst.OperationGTIDHint = inst.GTIDHintNeutral if topologyRecovery.RecoveryType == MasterRecoveryGTID { gtidHint = inst.GTIDHintForce @@ -2225,3 +2217,29 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, return topologyRecovery, promotedMasterCoordinates, err } + +func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoordinates, designatedInstance *inst.Instance, analysisEntry inst.ReplicationAnalysis) (topologyRecovery *TopologyRecovery, promotedMasterCoordinates *inst.BinlogCoordinates, err error) { + log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates) + if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { + return nil, nil, err + } + promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates + + log.Infof("GracefulMasterTakeover: attempting recovery") + recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) + if err != nil { + log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) + } + if !recoveryAttempted { + return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") + } + if topologyRecovery == nil { + return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen") + } + + if topologyRecovery.SuccessorKey == nil { + err = fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) + } + + return topologyRecovery, promotedMasterCoordinates, err +} From 1e5819162a99344af7d98b1404e01eba5ddf10d0 Mon Sep 17 00:00:00 2001 From: Oleksandr Fedorov Date: Wed, 3 Jul 2024 16:15:19 -0400 Subject: [PATCH 2/6] Add tests --- .../01-record-log-position/run | 2 ++ .../02-stop-replication/expect_output | 1 + .../02-stop-replication/run | 1 + .../03-takeover/expect_failure | 1 + .../03-takeover/run | 2 ++ .../04-topology/expect_output | 4 ++++ .../04-topology/run | 1 + .../05-check-logs/expect_output | 2 ++ .../05-check-logs/run | 3 +++ .../README.md | 19 +++++++++++++++++++ .../skip_run | 0 .../teardown | 3 +++ tests/system/orchestrator-ci-system.conf.json | 3 +++ 13 files changed, 42 insertions(+) create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/README.md create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/skip_run create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/teardown diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run new file mode 100644 index 00000000..393ba72a --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run @@ -0,0 +1,2 @@ +# Store the current number of Orchestrator log lines +wc -l /tmp/orchestrator.log.lines diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output new file mode 100644 index 00000000..3a5fc21c --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output @@ -0,0 +1 @@ +127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run new file mode 100644 index 00000000..07786295 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run @@ -0,0 +1 @@ +orchestrator-client -c stop-replica -i 127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure new file mode 100644 index 00000000..fd1bb9d8 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure @@ -0,0 +1 @@ +WaitForExecBinlogCoordinatesToReach: reached maxWait 20s on 127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run new file mode 100644 index 00000000..cbc5d9d8 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run @@ -0,0 +1,2 @@ +sleep 3 +orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output new file mode 100644 index 00000000..2b4a9dea --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output @@ -0,0 +1,4 @@ +127.0.0.1:10111 |ok |rw +- 127.0.0.1:10112 |nonreplicating|ro + + 127.0.0.1:10113|ok |ro + + 127.0.0.1:10114|ok |ro diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run new file mode 100644 index 00000000..7a0bc50e --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run @@ -0,0 +1 @@ +orchestrator-client -c topology-tabulated -alias ci | cut -d'|' -f 1,3,5 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output new file mode 100644 index 00000000..54cc2ace --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output @@ -0,0 +1,2 @@ +ERROR GracefulMasterTakeover: promotion failed. Will set 127.0.0.1:10111 as read_write +INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook 1 of 1: echo 'Planned takeover failed for 127.0.0.1:10111' diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run new file mode 100644 index 00000000..c0d3a1b0 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run @@ -0,0 +1,3 @@ +# Read the logs generated after the test started, and check for the expected messages +tail -n +$(cat /tmp/orchestrator.log.lines) /var/log/journal/orchestrator.service.log \ + | grep -oP '(ERROR GracefulMasterTakeover: promotion failed|INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook).*' diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/README.md b/tests/system/graceful-master-takeover-fail-replication-stopped/README.md new file mode 100644 index 00000000..a831a1dd --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/README.md @@ -0,0 +1,19 @@ +# Test the error handling when the master fails to take over gracefully and replication is stopped + +This error occurred in production when two takeovers were executed in a row. +The first take over was successful, but the second one failed. + +This error is reproducable by stopping replication on the replica that +is supposed to take over. Another way to reproduce this error is to +run two takeovers one immediately after the other: + +```sh +orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112 +orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10112 -d 127.0.0.1:10111 +``` + +In the end the topology will be in a partially failed state, with +replication stopped for replica `127.0.0.1:10112`, and the other two +replicas placed behind it. Though, `master` will still be writable, +and `PostUnsuccessfulGracefulTakeoverProcesses` hooks will be executed +to help the cluster recover. diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/skip_run b/tests/system/graceful-master-takeover-fail-replication-stopped/skip_run new file mode 100644 index 00000000..e69de29b diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/teardown b/tests/system/graceful-master-takeover-fail-replication-stopped/teardown new file mode 100644 index 00000000..deec9b2c --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/teardown @@ -0,0 +1,3 @@ +orchestrator-client -c start-replica -i 127.0.0.1:10112 +orchestrator-client -c relocate -i 127.0.0.1:10113 -d 127.0.0.1:10111 +orchestrator-client -c relocate -i 127.0.0.1:10114 -d 127.0.0.1:10111 diff --git a/tests/system/orchestrator-ci-system.conf.json b/tests/system/orchestrator-ci-system.conf.json index 18a083fc..17204154 100644 --- a/tests/system/orchestrator-ci-system.conf.json +++ b/tests/system/orchestrator-ci-system.conf.json @@ -76,6 +76,9 @@ "PostGracefulTakeoverProcesses": [ "echo 'Planned takeover complete' >> /tmp/recovery.log" ], + "PostUnsuccessfulGracefulTakeoverProcesses": [ + "echo 'Planned takeover failed for {failedHost}:{failedPort}'" + ], "CoMasterRecoveryMustPromoteOtherCoMaster": true, "DetachLostSlavesAfterMasterFailover": true, "ApplyMySQLPromotionAfterMasterFailover": true, From 3dba6390478a7f17bf060b2173b4d203f5a33002 Mon Sep 17 00:00:00 2001 From: Oleksandr Fedorov Date: Thu, 4 Jul 2024 17:08:23 -0400 Subject: [PATCH 3/6] Address comments: update logs and log SetReadOnly error --- go/logic/topology_recovery.go | 7 +++++-- .../05-check-logs/expect_output | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index b190d76b..f0f8da30 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -2176,8 +2176,11 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, topologyRecovery, promotedMasterCoordinates, err = gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates, designatedInstance, analysisEntry) if err != nil { - log.Errorf("GracefulMasterTakeover: promotion failed. Will set %+v as read_write", clusterMaster.Key) - inst.SetReadOnly(&clusterMaster.Key, false) + log.Errorf("GracefulMasterTakeover: promotion failed. Will unset read-only on %+v", clusterMaster.Key) + + if _, unsetReadOnlyErr := inst.SetReadOnly(&clusterMaster.Key, false); unsetReadOnlyErr != nil { + log.Errorf("GracefulMasterTakeover: failed to unset read_only on %+v: %+v", clusterMaster.Key, unsetReadOnlyErr) + } if topologyRecovery == nil { // If we failed to run the recovery, use the info we already have diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output index 54cc2ace..b586af03 100644 --- a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output @@ -1,2 +1,2 @@ -ERROR GracefulMasterTakeover: promotion failed. Will set 127.0.0.1:10111 as read_write +ERROR GracefulMasterTakeover: promotion failed. Will unset read-only on 127.0.0.1:10111 INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook 1 of 1: echo 'Planned takeover failed for 127.0.0.1:10111' From 295541844275294b03712d209871f5946f7c5e47 Mon Sep 17 00:00:00 2001 From: Oleksandr Fedorov Date: Thu, 12 Dec 2024 10:23:41 -0500 Subject: [PATCH 4/6] Reset error for ForceExecuteRecovery in gracefulMasterTakeover --- go/logic/topology_recovery.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index f0f8da30..50e77938 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -2232,6 +2232,7 @@ func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoord recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) if err != nil { log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) + err = nil } if !recoveryAttempted { return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") From 2bc687954260b1b69b7e349218e9b2261b9ceb9d Mon Sep 17 00:00:00 2001 From: Oleksandr Fedorov Date: Thu, 12 Dec 2024 10:24:46 -0500 Subject: [PATCH 5/6] Move PostUnsuccessfulGracefulTakeoverProcesses config to a dedicated test --- .../01-record-log-position/config.json | 5 +++++ tests/system/orchestrator-ci-system.conf.json | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/config.json diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/config.json b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/config.json new file mode 100644 index 00000000..ddd3c533 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/config.json @@ -0,0 +1,5 @@ +{ + "PostUnsuccessfulGracefulTakeoverProcesses": [ + "echo 'Planned takeover failed for {failedHost}:{failedPort}'" + ] +} diff --git a/tests/system/orchestrator-ci-system.conf.json b/tests/system/orchestrator-ci-system.conf.json index 17204154..18a083fc 100644 --- a/tests/system/orchestrator-ci-system.conf.json +++ b/tests/system/orchestrator-ci-system.conf.json @@ -76,9 +76,6 @@ "PostGracefulTakeoverProcesses": [ "echo 'Planned takeover complete' >> /tmp/recovery.log" ], - "PostUnsuccessfulGracefulTakeoverProcesses": [ - "echo 'Planned takeover failed for {failedHost}:{failedPort}'" - ], "CoMasterRecoveryMustPromoteOtherCoMaster": true, "DetachLostSlavesAfterMasterFailover": true, "ApplyMySQLPromotionAfterMasterFailover": true, From 736a1693835416deae516bf7937fe6cf203aa8da Mon Sep 17 00:00:00 2001 From: Oleksandr Fedorov Date: Thu, 12 Dec 2024 13:30:40 -0500 Subject: [PATCH 6/6] Clear the error in a proper place. --- go/logic/topology_recovery.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 50e77938..7226d21e 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -2232,7 +2232,6 @@ func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoord recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) if err != nil { log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) - err = nil } if !recoveryAttempted { return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") @@ -2243,6 +2242,9 @@ func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoord if topologyRecovery.SuccessorKey == nil { err = fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) + } else { + // Clear the error if all previous checks succeeded + err = nil } return topologyRecovery, promotedMasterCoordinates, err