Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ type Configuration struct {
PostMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
PostGracefulTakeoverProcesses []string // Processes to execute after runnign a graceful master takeover. Uses same placeholders as PostFailoverProcesses
PostUnsuccessfulGracefulTakeoverProcesses []string // Processes to execute after an unsuccessful graceful master takeover. Uses same placeholders as PostFailoverProcesses
PostTakeMasterProcesses []string // Processes to execute after a successful Take-Master event has taken place
RecoverNonWriteableMaster bool // When 'true', orchestrator treats a read-only master as a failure scenario and attempts to make the master writeable
CoMasterRecoveryMustPromoteOtherCoMaster bool // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-master or else fail
Expand Down Expand Up @@ -425,6 +426,7 @@ func newConfiguration() *Configuration {
PostFailoverProcesses: []string{},
PostUnsuccessfulFailoverProcesses: []string{},
PostGracefulTakeoverProcesses: []string{},
PostUnsuccessfulGracefulTakeoverProcesses: []string{},
PostTakeMasterProcesses: []string{},
RecoverNonWriteableMaster: false,
CoMasterRecoveryMustPromoteOtherCoMaster: true,
Expand Down
56 changes: 37 additions & 19 deletions go/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -2172,29 +2172,21 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey,
return nil, nil, err
}
demotedMasterSelfBinlogCoordinates := &clusterMaster.SelfBinlogCoordinates
log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates)
if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil {
return nil, nil, err
}
promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates

log.Infof("GracefulMasterTakeover: attempting recovery")
recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false)
topologyRecovery, promotedMasterCoordinates, err = gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates, designatedInstance, analysisEntry)

if err != nil {
log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err)
}
if !recoveryAttempted {
return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen")
}
if topologyRecovery == nil {
return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen")
}
if topologyRecovery.SuccessorKey == nil {
// Promotion fails.
// Undo setting read-only on original master.
log.Errorf("GracefulMasterTakeover: promotion failed. Will set %+v as read_write", clusterMaster.Key)
inst.SetReadOnly(&clusterMaster.Key, false)
return nil, nil, fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err)

if topologyRecovery == nil {
// If we failed to run the recovery, use the info we already have
topologyRecovery = preGracefulTakeoverTopologyRecovery
}
executeProcesses(config.Config.PostUnsuccessfulGracefulTakeoverProcesses, "PostUnsuccessfulGracefulTakeoverProcesses", topologyRecovery, false)
return nil, nil, err
}

var gtidHint inst.OperationGTIDHint = inst.GTIDHintNeutral
if topologyRecovery.RecoveryType == MasterRecoveryGTID {
gtidHint = inst.GTIDHintForce
Expand Down Expand Up @@ -2225,3 +2217,29 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey,

return topologyRecovery, promotedMasterCoordinates, err
}

func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoordinates, designatedInstance *inst.Instance, analysisEntry inst.ReplicationAnalysis) (topologyRecovery *TopologyRecovery, promotedMasterCoordinates *inst.BinlogCoordinates, err error) {
log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates)
if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil {
return nil, nil, err
}
promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates

log.Infof("GracefulMasterTakeover: attempting recovery")
recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @o-fedorov , I'm not sure about this error propagation and handling. In the original flow, if ForceExecuteRecovery returned err, but also returned recoveryAttempted and topologyRecovery, GracefulMasterTakeover continued.
Now it makes master R/W (which is fine), but then returns.

Copy link
Contributor Author

@o-fedorov o-fedorov Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, @kamil-holubicki , I forgot to clear the error at the end of the function. Updated now.

Note that in the original code, err is ignored below the refactored part, and is eventually overridden.

if err != nil {
log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err)
}
if !recoveryAttempted {
return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen")
}
if topologyRecovery == nil {
return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen")
}

if topologyRecovery.SuccessorKey == nil {
err = fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err)
}

return topologyRecovery, promotedMasterCoordinates, err
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Store the current number of Orchestrator log lines
wc -l </var/log/journal/orchestrator.service.log > /tmp/orchestrator.log.lines
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
127.0.0.1:10112
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
orchestrator-client -c stop-replica -i 127.0.0.1:10112
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
WaitForExecBinlogCoordinatesToReach: reached maxWait 20s on 127.0.0.1:10112
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sleep 3
orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
127.0.0.1:10111 |ok |rw
- 127.0.0.1:10112 |nonreplicating|ro
+ 127.0.0.1:10113|ok |ro
+ 127.0.0.1:10114|ok |ro
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
orchestrator-client -c topology-tabulated -alias ci | cut -d'|' -f 1,3,5
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ERROR GracefulMasterTakeover: promotion failed. Will set 127.0.0.1:10111 as read_write
INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook 1 of 1: echo 'Planned takeover failed for 127.0.0.1:10111'
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Read the logs generated after the test started, and check for the expected messages
tail -n +$(cat /tmp/orchestrator.log.lines) /var/log/journal/orchestrator.service.log \
| grep -oP '(ERROR GracefulMasterTakeover: promotion failed|INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook).*'
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Test the error handling when the master fails to take over gracefully and replication is stopped

This error occurred in production when two takeovers were executed in a row.
The first take over was successful, but the second one failed.

This error is reproducable by stopping replication on the replica that
is supposed to take over. Another way to reproduce this error is to
run two takeovers one immediately after the other:

```sh
orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112
orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10112 -d 127.0.0.1:10111
```

In the end the topology will be in a partially failed state, with
replication stopped for replica `127.0.0.1:10112`, and the other two
replicas placed behind it. Though, `master` will still be writable,
and `PostUnsuccessfulGracefulTakeoverProcesses` hooks will be executed
to help the cluster recover.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
orchestrator-client -c start-replica -i 127.0.0.1:10112
orchestrator-client -c relocate -i 127.0.0.1:10113 -d 127.0.0.1:10111
orchestrator-client -c relocate -i 127.0.0.1:10114 -d 127.0.0.1:10111
3 changes: 3 additions & 0 deletions tests/system/orchestrator-ci-system.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@
"PostGracefulTakeoverProcesses": [
"echo 'Planned takeover complete' >> /tmp/recovery.log"
],
"PostUnsuccessfulGracefulTakeoverProcesses": [
"echo 'Planned takeover failed for {failedHost}:{failedPort}'"
],
"CoMasterRecoveryMustPromoteOtherCoMaster": true,
"DetachLostSlavesAfterMasterFailover": true,
"ApplyMySQLPromotionAfterMasterFailover": true,
Expand Down