diff --git a/go/config/config.go b/go/config/config.go index 9795383f..2c3ea7c2 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -242,6 +242,7 @@ type Configuration struct { PostMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses PostGracefulTakeoverProcesses []string // Processes to execute after runnign a graceful master takeover. Uses same placeholders as PostFailoverProcesses + PostUnsuccessfulGracefulTakeoverProcesses []string // Processes to execute after an unsuccessful graceful master takeover. Uses same placeholders as PostFailoverProcesses PostTakeMasterProcesses []string // Processes to execute after a successful Take-Master event has taken place RecoverNonWriteableMaster bool // When 'true', orchestrator treats a read-only master as a failure scenario and attempts to make the master writeable CoMasterRecoveryMustPromoteOtherCoMaster bool // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-master or else fail @@ -425,6 +426,7 @@ func newConfiguration() *Configuration { PostFailoverProcesses: []string{}, PostUnsuccessfulFailoverProcesses: []string{}, PostGracefulTakeoverProcesses: []string{}, + PostUnsuccessfulGracefulTakeoverProcesses: []string{}, PostTakeMasterProcesses: []string{}, RecoverNonWriteableMaster: false, CoMasterRecoveryMustPromoteOtherCoMaster: true, diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 7939ac8a..7226d21e 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -2172,29 +2172,24 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, return nil, nil, err } demotedMasterSelfBinlogCoordinates := &clusterMaster.SelfBinlogCoordinates - log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates) - if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { - return nil, nil, err - } - promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates - log.Infof("GracefulMasterTakeover: attempting recovery") - recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) + topologyRecovery, promotedMasterCoordinates, err = gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates, designatedInstance, analysisEntry) + if err != nil { - log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) - } - if !recoveryAttempted { - return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") - } - if topologyRecovery == nil { - return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen") - } - if topologyRecovery.SuccessorKey == nil { - // Promotion fails. - // Undo setting read-only on original master. - inst.SetReadOnly(&clusterMaster.Key, false) - return nil, nil, fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) + log.Errorf("GracefulMasterTakeover: promotion failed. Will unset read-only on %+v", clusterMaster.Key) + + if _, unsetReadOnlyErr := inst.SetReadOnly(&clusterMaster.Key, false); unsetReadOnlyErr != nil { + log.Errorf("GracefulMasterTakeover: failed to unset read_only on %+v: %+v", clusterMaster.Key, unsetReadOnlyErr) + } + + if topologyRecovery == nil { + // If we failed to run the recovery, use the info we already have + topologyRecovery = preGracefulTakeoverTopologyRecovery + } + executeProcesses(config.Config.PostUnsuccessfulGracefulTakeoverProcesses, "PostUnsuccessfulGracefulTakeoverProcesses", topologyRecovery, false) + return nil, nil, err } + var gtidHint inst.OperationGTIDHint = inst.GTIDHintNeutral if topologyRecovery.RecoveryType == MasterRecoveryGTID { gtidHint = inst.GTIDHintForce @@ -2225,3 +2220,32 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, return topologyRecovery, promotedMasterCoordinates, err } + +func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoordinates, designatedInstance *inst.Instance, analysisEntry inst.ReplicationAnalysis) (topologyRecovery *TopologyRecovery, promotedMasterCoordinates *inst.BinlogCoordinates, err error) { + log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates) + if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { + return nil, nil, err + } + promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates + + log.Infof("GracefulMasterTakeover: attempting recovery") + recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) + if err != nil { + log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) + } + if !recoveryAttempted { + return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") + } + if topologyRecovery == nil { + return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen") + } + + if topologyRecovery.SuccessorKey == nil { + err = fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) + } else { + // Clear the error if all previous checks succeeded + err = nil + } + + return topologyRecovery, promotedMasterCoordinates, err +} diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/config.json b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/config.json new file mode 100644 index 00000000..ddd3c533 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/config.json @@ -0,0 +1,5 @@ +{ + "PostUnsuccessfulGracefulTakeoverProcesses": [ + "echo 'Planned takeover failed for {failedHost}:{failedPort}'" + ] +} diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run new file mode 100644 index 00000000..393ba72a --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run @@ -0,0 +1,2 @@ +# Store the current number of Orchestrator log lines +wc -l /tmp/orchestrator.log.lines diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output new file mode 100644 index 00000000..3a5fc21c --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output @@ -0,0 +1 @@ +127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run new file mode 100644 index 00000000..07786295 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run @@ -0,0 +1 @@ +orchestrator-client -c stop-replica -i 127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure new file mode 100644 index 00000000..fd1bb9d8 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure @@ -0,0 +1 @@ +WaitForExecBinlogCoordinatesToReach: reached maxWait 20s on 127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run new file mode 100644 index 00000000..cbc5d9d8 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run @@ -0,0 +1,2 @@ +sleep 3 +orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output new file mode 100644 index 00000000..2b4a9dea --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output @@ -0,0 +1,4 @@ +127.0.0.1:10111 |ok |rw +- 127.0.0.1:10112 |nonreplicating|ro + + 127.0.0.1:10113|ok |ro + + 127.0.0.1:10114|ok |ro diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run new file mode 100644 index 00000000..7a0bc50e --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run @@ -0,0 +1 @@ +orchestrator-client -c topology-tabulated -alias ci | cut -d'|' -f 1,3,5 diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output new file mode 100644 index 00000000..b586af03 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output @@ -0,0 +1,2 @@ +ERROR GracefulMasterTakeover: promotion failed. Will unset read-only on 127.0.0.1:10111 +INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook 1 of 1: echo 'Planned takeover failed for 127.0.0.1:10111' diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run new file mode 100644 index 00000000..c0d3a1b0 --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run @@ -0,0 +1,3 @@ +# Read the logs generated after the test started, and check for the expected messages +tail -n +$(cat /tmp/orchestrator.log.lines) /var/log/journal/orchestrator.service.log \ + | grep -oP '(ERROR GracefulMasterTakeover: promotion failed|INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook).*' diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/README.md b/tests/system/graceful-master-takeover-fail-replication-stopped/README.md new file mode 100644 index 00000000..a831a1dd --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/README.md @@ -0,0 +1,19 @@ +# Test the error handling when the master fails to take over gracefully and replication is stopped + +This error occurred in production when two takeovers were executed in a row. +The first take over was successful, but the second one failed. + +This error is reproducable by stopping replication on the replica that +is supposed to take over. Another way to reproduce this error is to +run two takeovers one immediately after the other: + +```sh +orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112 +orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10112 -d 127.0.0.1:10111 +``` + +In the end the topology will be in a partially failed state, with +replication stopped for replica `127.0.0.1:10112`, and the other two +replicas placed behind it. Though, `master` will still be writable, +and `PostUnsuccessfulGracefulTakeoverProcesses` hooks will be executed +to help the cluster recover. diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/skip_run b/tests/system/graceful-master-takeover-fail-replication-stopped/skip_run new file mode 100644 index 00000000..e69de29b diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/teardown b/tests/system/graceful-master-takeover-fail-replication-stopped/teardown new file mode 100644 index 00000000..deec9b2c --- /dev/null +++ b/tests/system/graceful-master-takeover-fail-replication-stopped/teardown @@ -0,0 +1,3 @@ +orchestrator-client -c start-replica -i 127.0.0.1:10112 +orchestrator-client -c relocate -i 127.0.0.1:10113 -d 127.0.0.1:10111 +orchestrator-client -c relocate -i 127.0.0.1:10114 -d 127.0.0.1:10111