-
Notifications
You must be signed in to change notification settings - Fork 45
Handle failures in GracefulMasterTakeover #44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 2 commits
747fa26
1e58191
3dba639
2955418
2bc6879
736a169
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2172,29 +2172,21 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, | |
| return nil, nil, err | ||
| } | ||
| demotedMasterSelfBinlogCoordinates := &clusterMaster.SelfBinlogCoordinates | ||
| log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates) | ||
| if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { | ||
| return nil, nil, err | ||
| } | ||
| promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates | ||
|
|
||
| log.Infof("GracefulMasterTakeover: attempting recovery") | ||
| recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) | ||
| topologyRecovery, promotedMasterCoordinates, err = gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates, designatedInstance, analysisEntry) | ||
|
|
||
| if err != nil { | ||
| log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) | ||
| } | ||
| if !recoveryAttempted { | ||
| return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") | ||
| } | ||
| if topologyRecovery == nil { | ||
| return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen") | ||
| } | ||
| if topologyRecovery.SuccessorKey == nil { | ||
| // Promotion fails. | ||
| // Undo setting read-only on original master. | ||
| log.Errorf("GracefulMasterTakeover: promotion failed. Will set %+v as read_write", clusterMaster.Key) | ||
| inst.SetReadOnly(&clusterMaster.Key, false) | ||
o-fedorov marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return nil, nil, fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) | ||
|
|
||
| if topologyRecovery == nil { | ||
| // If we failed to run the recovery, use the info we already have | ||
| topologyRecovery = preGracefulTakeoverTopologyRecovery | ||
| } | ||
| executeProcesses(config.Config.PostUnsuccessfulGracefulTakeoverProcesses, "PostUnsuccessfulGracefulTakeoverProcesses", topologyRecovery, false) | ||
| return nil, nil, err | ||
| } | ||
|
|
||
| var gtidHint inst.OperationGTIDHint = inst.GTIDHintNeutral | ||
| if topologyRecovery.RecoveryType == MasterRecoveryGTID { | ||
| gtidHint = inst.GTIDHintForce | ||
|
|
@@ -2225,3 +2217,29 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, | |
|
|
||
| return topologyRecovery, promotedMasterCoordinates, err | ||
| } | ||
|
|
||
| func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoordinates, designatedInstance *inst.Instance, analysisEntry inst.ReplicationAnalysis) (topologyRecovery *TopologyRecovery, promotedMasterCoordinates *inst.BinlogCoordinates, err error) { | ||
| log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates) | ||
| if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { | ||
| return nil, nil, err | ||
| } | ||
| promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates | ||
|
|
||
| log.Infof("GracefulMasterTakeover: attempting recovery") | ||
| recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @o-fedorov , I'm not sure about this error propagation and handling. In the original flow, if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you, @kamil-holubicki , I forgot to clear the error at the end of the function. Updated now. Note that in the original code, |
||
| if err != nil { | ||
| log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) | ||
| } | ||
| if !recoveryAttempted { | ||
| return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") | ||
| } | ||
| if topologyRecovery == nil { | ||
| return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen") | ||
| } | ||
|
|
||
| if topologyRecovery.SuccessorKey == nil { | ||
| err = fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) | ||
| } | ||
|
|
||
| return topologyRecovery, promotedMasterCoordinates, err | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| # Store the current number of Orchestrator log lines | ||
| wc -l </var/log/journal/orchestrator.service.log > /tmp/orchestrator.log.lines |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| 127.0.0.1:10112 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| orchestrator-client -c stop-replica -i 127.0.0.1:10112 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| WaitForExecBinlogCoordinatesToReach: reached maxWait 20s on 127.0.0.1:10112 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| sleep 3 | ||
| orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| 127.0.0.1:10111 |ok |rw | ||
| - 127.0.0.1:10112 |nonreplicating|ro | ||
| + 127.0.0.1:10113|ok |ro | ||
| + 127.0.0.1:10114|ok |ro |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| orchestrator-client -c topology-tabulated -alias ci | cut -d'|' -f 1,3,5 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| ERROR GracefulMasterTakeover: promotion failed. Will set 127.0.0.1:10111 as read_write | ||
| INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook 1 of 1: echo 'Planned takeover failed for 127.0.0.1:10111' |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| # Read the logs generated after the test started, and check for the expected messages | ||
| tail -n +$(cat /tmp/orchestrator.log.lines) /var/log/journal/orchestrator.service.log \ | ||
| | grep -oP '(ERROR GracefulMasterTakeover: promotion failed|INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook).*' |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| # Test the error handling when a graceful master takeover fails because replication is stopped on the designated replica | ||
|
|
||
| This error occurred in production when two takeovers were executed in a row. | ||
| The first takeover was successful, but the second one failed. | ||
|
|
||
| This error is reproducible by stopping replication on the replica that | ||
| is supposed to take over. Another way to reproduce this error is to | ||
| run two takeovers one immediately after the other: | ||
|
|
||
| ```sh | ||
| orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112 | ||
| orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10112 -d 127.0.0.1:10111 | ||
| ``` | ||
|
|
||
| In the end the topology will be in a partially failed state, with | ||
| replication stopped for replica `127.0.0.1:10112`, and the other two | ||
| replicas placed behind it. However, the `master` will still be writable, | ||
| and `PostUnsuccessfulGracefulTakeoverProcesses` hooks will be executed | ||
| to help the cluster recover. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| orchestrator-client -c start-replica -i 127.0.0.1:10112 | ||
| orchestrator-client -c relocate -i 127.0.0.1:10113 -d 127.0.0.1:10111 | ||
| orchestrator-client -c relocate -i 127.0.0.1:10114 -d 127.0.0.1:10111 |
Uh oh!
There was an error while loading. Please reload this page.