percona · o-fedorov · Jul 1, 2024 · Jul 3, 2024 · Jul 4, 2024 · Dec 12, 2024
diff --git a/go/config/config.go b/go/config/config.go
@@ -242,6 +242,7 @@ type Configuration struct {
 	PostMasterFailoverProcesses                []string          // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
 	PostIntermediateMasterFailoverProcesses    []string          // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
 	PostGracefulTakeoverProcesses              []string          // Processes to execute after runnign a graceful master takeover. Uses same placeholders as PostFailoverProcesses
+	PostUnsuccessfulGracefulTakeoverProcesses  []string          // Processes to execute after an unsuccessful graceful master takeover. Uses same placeholders as PostFailoverProcesses
 	PostTakeMasterProcesses                    []string          // Processes to execute after a successful Take-Master event has taken place
 	RecoverNonWriteableMaster                  bool              // When 'true', orchestrator treats a read-only master as a failure scenario and attempts to make the master writeable
 	CoMasterRecoveryMustPromoteOtherCoMaster   bool              // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-master or else fail
@@ -425,6 +426,7 @@ func newConfiguration() *Configuration {
 		PostFailoverProcesses:                      []string{},
 		PostUnsuccessfulFailoverProcesses:          []string{},
 		PostGracefulTakeoverProcesses:              []string{},
+		PostUnsuccessfulGracefulTakeoverProcesses:  []string{},
 		PostTakeMasterProcesses:                    []string{},
 		RecoverNonWriteableMaster:                  false,
 		CoMasterRecoveryMustPromoteOtherCoMaster:   true,

diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go
@@ -2172,29 +2172,21 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey,
 		return nil, nil, err
 	}
 	demotedMasterSelfBinlogCoordinates := &clusterMaster.SelfBinlogCoordinates
-	log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates)
-	if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil {
-		return nil, nil, err
-	}
-	promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates
 
-	log.Infof("GracefulMasterTakeover: attempting recovery")
-	recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false)
+	topologyRecovery, promotedMasterCoordinates, err = gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates, designatedInstance, analysisEntry)
+
 	if err != nil {
-		log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err)
-	}
-	if !recoveryAttempted {
-		return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen")
-	}
-	if topologyRecovery == nil {
-		return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen")
-	}
-	if topologyRecovery.SuccessorKey == nil {
-		// Promotion fails.
-		// Undo setting read-only on original master.
+		log.Errorf("GracefulMasterTakeover: promotion failed. Will set %+v as read_write", clusterMaster.Key)
 		inst.SetReadOnly(&clusterMaster.Key, false)
-		return nil, nil, fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err)
+
+		if topologyRecovery == nil {
+			// If we failed to run the recovery, use the info we already have
+			topologyRecovery = preGracefulTakeoverTopologyRecovery
+		}
+		executeProcesses(config.Config.PostUnsuccessfulGracefulTakeoverProcesses, "PostUnsuccessfulGracefulTakeoverProcesses", topologyRecovery, false)
+		return nil, nil, err
 	}
+
 	var gtidHint inst.OperationGTIDHint = inst.GTIDHintNeutral
 	if topologyRecovery.RecoveryType == MasterRecoveryGTID {
 		gtidHint = inst.GTIDHintForce
@@ -2225,3 +2217,29 @@ func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey,
 
 	return topologyRecovery, promotedMasterCoordinates, err
 }
+
+func gracefulMasterTakeover(demotedMasterSelfBinlogCoordinates *inst.BinlogCoordinates, designatedInstance *inst.Instance, analysisEntry inst.ReplicationAnalysis) (topologyRecovery *TopologyRecovery, promotedMasterCoordinates *inst.BinlogCoordinates, err error) {
+	log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates)
+	if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil {
+		return nil, nil, err
+	}
+	promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates
+
+	log.Infof("GracefulMasterTakeover: attempting recovery")
+	recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false)
+	if err != nil {
+		log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err)
+	}
+	if !recoveryAttempted {
+		return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen")
+	}
+	if topologyRecovery == nil {
+		return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen")
+	}
+
+	if topologyRecovery.SuccessorKey == nil {
+		err = fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err)
+	}
+
+	return topologyRecovery, promotedMasterCoordinates, err
+}
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run b/tests/system/graceful-master-takeover-fail-replication-stopped/01-record-log-position/run
@@ -0,0 +1,2 @@
+# Store the current number of Orchestrator log lines
+wc -l </var/log/journal/orchestrator.service.log > /tmp/orchestrator.log.lines
diff --git a/...ystem/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output b/...ystem/graceful-master-takeover-fail-replication-stopped/02-stop-replication/expect_output
@@ -0,0 +1 @@
+127.0.0.1:10112
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run b/tests/system/graceful-master-takeover-fail-replication-stopped/02-stop-replication/run
@@ -0,0 +1 @@
+orchestrator-client -c stop-replica -i 127.0.0.1:10112
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/expect_failure
@@ -0,0 +1 @@
+WaitForExecBinlogCoordinatesToReach: reached maxWait 20s on 127.0.0.1:10112
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run b/tests/system/graceful-master-takeover-fail-replication-stopped/03-takeover/run
@@ -0,0 +1,2 @@
+sleep 3
+orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/expect_output
@@ -0,0 +1,4 @@
+127.0.0.1:10111    |ok            |rw
+- 127.0.0.1:10112  |nonreplicating|ro
+  + 127.0.0.1:10113|ok            |ro
+  + 127.0.0.1:10114|ok            |ro
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run b/tests/system/graceful-master-takeover-fail-replication-stopped/04-topology/run
@@ -0,0 +1 @@
+orchestrator-client -c topology-tabulated -alias ci | cut -d'|' -f 1,3,5
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/expect_output
@@ -0,0 +1,2 @@
+ERROR GracefulMasterTakeover: promotion failed. Will set 127.0.0.1:10111 as read_write
+INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook 1 of 1: echo 'Planned takeover failed for 127.0.0.1:10111'
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run b/tests/system/graceful-master-takeover-fail-replication-stopped/05-check-logs/run
@@ -0,0 +1,3 @@
+# Read the logs generated after the test started, and check for the expected messages
+tail -n +$(cat /tmp/orchestrator.log.lines) /var/log/journal/orchestrator.service.log \
+  | grep -oP '(ERROR GracefulMasterTakeover: promotion failed|INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook).*'
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/README.md b/tests/system/graceful-master-takeover-fail-replication-stopped/README.md
@@ -0,0 +1,19 @@
+# Test the error handling when the master fails to take over gracefully and replication is stopped
+
+This error occurred in production when two takeovers were executed in a row.
+The first take over was successful, but the second one failed.
+
+This error is reproducable by stopping replication on the replica that
+is supposed to take over. Another way to reproduce this error is to
+run two takeovers one immediately after the other:
+
+```sh
+orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112
+orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10112 -d 127.0.0.1:10111
+```
+
+In the end the topology will be in a partially failed state, with
+replication stopped for replica `127.0.0.1:10112`, and the other two
+replicas placed behind it. Though, `master` will still be writable,
+and `PostUnsuccessfulGracefulTakeoverProcesses` hooks will be executed
+to help the cluster recover.
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/skip_run b/tests/system/graceful-master-takeover-fail-replication-stopped/skip_run
diff --git a/tests/system/graceful-master-takeover-fail-replication-stopped/teardown b/tests/system/graceful-master-takeover-fail-replication-stopped/teardown
@@ -0,0 +1,3 @@
+orchestrator-client -c start-replica -i 127.0.0.1:10112
+orchestrator-client -c relocate -i 127.0.0.1:10113 -d 127.0.0.1:10111
+orchestrator-client -c relocate -i 127.0.0.1:10114 -d 127.0.0.1:10111
diff --git a/tests/system/orchestrator-ci-system.conf.json b/tests/system/orchestrator-ci-system.conf.json
@@ -76,6 +76,9 @@
   "PostGracefulTakeoverProcesses": [
     "echo 'Planned takeover complete' >> /tmp/recovery.log"
   ],
+  "PostUnsuccessfulGracefulTakeoverProcesses": [
+    "echo 'Planned takeover failed for {failedHost}:{failedPort}'"
+  ],
   "CoMasterRecoveryMustPromoteOtherCoMaster": true,
   "DetachLostSlavesAfterMasterFailover": true,
   "ApplyMySQLPromotionAfterMasterFailover": true,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Store the current number of Orchestrator log lines
		wc -l </var/log/journal/orchestrator.service.log > /tmp/orchestrator.log.lines
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		orchestrator-client -c stop-replica -i 127.0.0.1:10112
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		WaitForExecBinlogCoordinatesToReach: reached maxWait 20s on 127.0.0.1:10112
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		sleep 3
		orchestrator-client -c graceful-master-takeover -i 127.0.0.1:10111 -d 127.0.0.1:10112
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		orchestrator-client -c topology-tabulated -alias ci \| cut -d'\|' -f 1,3,5
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ERROR GracefulMasterTakeover: promotion failed. Will set 127.0.0.1:10111 as read_write
		INFO topology_recovery: Running PostUnsuccessfulGracefulTakeoverProcesses hook 1 of 1: echo 'Planned takeover failed for 127.0.0.1:10111'