
Commit d10efc3

Julien-Benlsierant authored and committed
CLOUDP-288588: MC Sharded DR - workaround deadlocked mongos (#4055)
# Summary

This patch implements a workaround for the deadlock that occurs when the config server topology is changed (the process list is modified) while mongos processes are undergoing a rolling restart. See the [description in TD](REDACTED) for more information.

Changes:

- A new way (`GetMongoDBDeploymentState`) of gathering the deployment state by combining agents' signals from two endpoints (agent pings and agent goal statuses). We currently use them separately in [waitForAgentsToRegister](https://github.com/10gen/ops-manager-kubernetes/blob/6a02d755cbb2f54955c9510e97b1aee6e9ce89e2/controllers/operator/mongodbshardedcluster_controller.go#L1651) and in [WaitForReadyState](https://github.com/10gen/ops-manager-kubernetes/blob/6a02d755cbb2f54955c9510e97b1aee6e9ce89e2/controllers/operator/mongodbshardedcluster_controller.go#L1679). Combining those signals in one structure makes it easier to reason about the state of the deployment.
- For the deadlock workaround, we use `GetMongoDBDeploymentState` to assess whether:
  - only mongos processes have not reached their goal state
  - there are stale processes (whose agents haven't reported pings for >2 min), indicating a disaster scenario (e.g. a k8s cluster being down)
  - the agents of those mongos processes are performing the RollingChangeArgs move of their plan
  - on top of that, we additionally verify that we're in the process of scaling (e.g. removing unhealthy nodes, but we don't restrict the workaround to only that case)
- If all of the above is satisfied, we allow the operator to **not wait for mongos to reach goal state** with their AC, because they cannot progress while unhealthy nodes are present in the cluster (an illustrative sketch of this decision follows the description below).
- `GetMongoDBDeploymentState` might be used in the future to refactor and unify the existing checks for registered agents and goal states.
- A new way of reusing e2e tests for both CloudQA and Ops Manager (see `multi_cluster_sharded_disaster_recovery.py`). This is implemented by checking whether the `ops_manager_version` env var is `cloud_qa`; if it is, we're in the variant that runs the test against Cloud QA, otherwise the test deploys OM and uses it to deploy the resources.

## Proof of Work

### Patch with the workaround enabled ([evg link](https://spruce.mongodb.com/task/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_disaster_recovery_patch_647ca535b9163e5e9eb9025516f3407bf5ad831a_67af0eac1b1cc300075c0b3b_25_02_14_09_36_45?execution=0&sortBy=STATUS&sortDir=ASC))

At some point the operator detects the situation [here](https://parsley.mongodb.com/taskFile/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_disaster_recovery_patch_647ca535b9163e5e9eb9025516f3407bf5ad831a_67af0eac1b1cc300075c0b3b_25_02_14_09_36_45/0/mongodb-enterprise-operator-multi-cluster-6d9cfcfc4b-zflvq-mongodb-enterprise-operator-multi-cluster.log?bookmarks=0%2C19689&selectedLineRange=L1054&shareLine=1054):

```
2025-02-14T10:12:20.522Z WARN operator/mongodbshardedcluster_controller.go:2864 Detected mongos [{Hostname:sh-disaster-recovery-mongos-0-0-svc.a-1739526583-2kq82uy69jz.svc.cluster.local LastAgentPing:2025-02-14 10:11:49 +0000 UTC GoalVersionAchieved:4 Plan:[RollingChangeArgs] ProcessName:sh-disaster-recovery-mongos-0-0}] performing RollingChangeArgs operation while there are processes in the cluster that are considered down. Skipping waiting for those mongos processes in order to allow the operator to perform scaling. Please verify the list of stale (down/unhealthy) processes and change MongoDB resource to remove them from the cluster. The operator will not perform removal of those procesess automatically. Hostnames of stale processes: [sh-disaster-recovery-mongos-2-0-svc.a-1739526583-2kq82uy69jz.svc.cluster.local sh-disaster-recovery-mongos-2-1-svc.a-1739526583-2kq82uy69jz.svc.cluster.local sh-disaster-recovery-0-2-0-svc.a-1739526583-2kq82uy69jz.svc.cluster.local sh-disaster-recovery-1-2-0-svc.a-1739526583-2kq82uy69jz.svc.cluster.local sh-disaster-recovery-1-2-1-svc.a-1739526583-2kq82uy69jz.svc.cluster.local sh-disaster-recovery-config-2-0-svc.a-1739526583-2kq82uy69jz.svc.cluster.local]
```

We can see that the list of stale processes contains all the hosts from the failed cluster (clusterIdx=2). The next line shows that the mongos is removed from the list of processes whose goal state we wait for:

```
2025-02-14T10:12:20.522Z WARN operator/mongodbshardedcluster_controller.go:2762 The following processes are skipped from waiting for the goal state: [sh-disaster-recovery-mongos-0-0] {"ShardedCluster": "a-1739526583-2kq82uy69jz/sh-disaster-recovery"}
```

The operator then scales down those failed processes, and we can observe the last deadlock detection [here](https://parsley.mongodb.com/taskFile/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_disaster_recovery_patch_647ca535b9163e5e9eb9025516f3407bf5ad831a_67af0eac1b1cc300075c0b3b_25_02_14_09_36_45/0/mongodb-enterprise-operator-multi-cluster-6d9cfcfc4b-zflvq-mongodb-enterprise-operator-multi-cluster.log?bookmarks=0%2C19689&highlights=Updating%2520status%253A%2520phase%253DRunning%2CRollingChangeArgs%2Cperforming%2520RollingChangeArgs%2520operation%2520while%2520there%2520are%2520processes%2520in%2520the%2520cluster%2520that%2520are%2520considered%2520down&selectedLineRange=L1882&shareLine=1882).

Afterwards, the mongos is the only process the operator is waiting for (deadlock detection didn't kick in this time because not all conditions are satisfied - there are no stale processes anymore - so the mongos is not removed from the list to wait for):

```
MongoDB agents haven't reached READY state; 1 processes waiting to reach automation config goal state (version=7): [sh-disaster-recovery-mongos-0-0@4], 9 processes reached goal state: [sh-disaster-recovery-0-0-0 sh-disaster-recovery-0-0-1 sh-disaster-recovery-config-1-0 sh-disaster-recovery-1-1-0 sh-disaster-recovery-1-0-1 sh-disaster-recovery-0-1-0 sh-disaster-recovery-1-0-0 sh-disaster-recovery-config-0-0 sh-disaster-recovery-config-0-1]
```

[Later](https://parsley.mongodb.com/taskFile/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_disaster_recovery_patch_647ca535b9163e5e9eb9025516f3407bf5ad831a_67af0eac1b1cc300075c0b3b_25_02_14_09_36_45/0/mongodb-enterprise-operator-multi-cluster-6d9cfcfc4b-zflvq-mongodb-enterprise-operator-multi-cluster.log?bookmarks=0%2C19689&highlights=Updating%2520status%253A%2520phase%253DRunning%2CRollingChangeArgs%2Cperforming%2520RollingChangeArgs%2520operation%2520while%2520there%2520are%2520processes%2520in%2520the%2520cluster%2520that%2520are%2520considered%2520down&selectedLineRange=L2799&shareLine=2799) we can see the cluster reach the running state, and the deployment state contains the scaled-down member counts in cluster-3:

```json
{
  "sizeStatusInClusters": {
    "shardMongodsInClusters": {
      "kind-e2e-cluster-1": 2,
      "kind-e2e-cluster-2": 1,
      "kind-e2e-cluster-3": 0
    },
    "mongosCountInClusters": {
      "kind-e2e-cluster-1": 1,
      "kind-e2e-cluster-2": 0,
      "kind-e2e-cluster-3": 0
    },
    "configServerMongodsInClusters": {
      "kind-e2e-cluster-1": 2,
      "kind-e2e-cluster-2": 1,
      "kind-e2e-cluster-3": 0
    }
  }
}
```

### Patch with the workaround manually commented out ([evg link](https://spruce.mongodb.com/task/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_disaster_recovery_patch_647ca535b9163e5e9eb9025516f3407bf5ad831a_67af0e778e05410007c4c158_25_02_14_09_35_52?execution=4&sortBy=STATUS&sortDir=ASC))

We can [see](https://parsley.mongodb.com/taskFile/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_disaster_recovery_patch_647ca535b9163e5e9eb9025516f3407bf5ad831a_67af0e778e05410007c4c158_25_02_14_09_35_52/0/mongodb-enterprise-operator-multi-cluster-68586b465c-fz7cw-mongodb-enterprise-operator-multi-cluster.log?bookmarks=0%2C3076&selectedLineRange=L2916&shareLine=2916) the operator waiting for a long time for the deadlocked mongos, and the test times out after 20 mins:

```
automation agents haven't reached READY state during defined interval: MongoDB agents haven't reached READY state; 1 processes waiting to reach automation config goal state (version=5): [sh-disaster-recovery-mongos-0-0@4], 9 processes reached goal state: [sh-disaster-recovery-0-1-0 sh-disaster-recovery-1-1-0 sh-disaster-recovery-1-0-1 sh-disaster-recovery-1-0-0 sh-disaster-recovery-0-0-0 sh-disaster-recovery-0-0-1 sh-disaster-recovery-config-0-0 sh-disaster-recovery-config-1-0 sh-disaster-recovery-config-0-1]
```

---------

Co-authored-by: Łukasz Sierant <[email protected]>
1 parent abed091 commit d10efc3

22 files changed: +1777 -197 lines

.evergreen.yml

Lines changed: 4 additions & 3 deletions
@@ -790,8 +790,7 @@ task_groups:
       - e2e_multi_cluster_sharded_simplest_no_mesh
       - e2e_multi_cluster_sharded_tls_no_mesh
       - e2e_multi_cluster_sharded_tls
-      # To re-activate as part of https://jira.mongodb.org/browse/CLOUDP-288588
-      #- e2e_multi_cluster_sharded_disaster_recovery
+      - e2e_multi_cluster_sharded_disaster_recovery
       - e2e_sharded_cluster
       - e2e_sharded_cluster_agent_flags
       - e2e_sharded_cluster_custom_podspec
@@ -869,6 +868,7 @@ task_groups:
       - e2e_om_update_before_reconciliation
       - e2e_om_feature_controls
       - e2e_multi_cluster_appdb_state_operator_upgrade_downgrade
+      - e2e_multi_cluster_sharded_disaster_recovery
       # disabled tests:
       # - e2e_om_multiple # multi-cluster failures in EVG
       # - e2e_om_appdb_scale_up_down # test not "reused" for multi-cluster appdb
@@ -921,14 +921,15 @@ task_groups:
       - e2e_multi_cluster_appdb_state_operator_upgrade_downgrade
       - e2e_om_update_before_reconciliation
       - e2e_om_feature_controls
+      - e2e_multi_cluster_sharded_disaster_recovery
     <<: *teardown_group
 
   # Dedicated task group for deploying OM Multi-Cluster when the operator is in the central cluster
   # that is not in the mesh
   - name: e2e_multi_cluster_om_operator_not_in_mesh_task_group
     max_hosts: -1
     <<: *setup_group_multi_cluster
-    <<: *setup_and_teardown_task_cloudqa
+    <<: *setup_and_teardown_task
     tasks:
       - e2e_multi_cluster_om_clusterwide_operator_not_in_mesh_networking
     <<: *teardown_group

controllers/om/agent.go

Lines changed: 3 additions & 3 deletions
@@ -9,7 +9,7 @@ import (
 
 // Checks if the agents have registered.
 
-type automationAgentStatusResponse struct {
+type AutomationAgentStatusResponse struct {
     OMPaginated
     AutomationAgents []AgentStatus `json:"results"`
 }
@@ -22,7 +22,7 @@ type AgentStatus struct {
     TypeName string `json:"typeName"`
 }
 
-var _ Paginated = automationAgentStatusResponse{}
+var _ Paginated = AutomationAgentStatusResponse{}
 
 // IsRegistered will return true if this given agent has `hostname_prefix` as a
 // prefix. This is needed to check if the given agent has registered.
@@ -48,7 +48,7 @@ func (agent AgentStatus) IsRegistered(hostnamePrefix string, log *zap.SugaredLog
 }
 
 // Results are needed to fulfil the Paginated interface
-func (aar automationAgentStatusResponse) Results() []interface{} {
+func (aar AutomationAgentStatusResponse) Results() []interface{} {
     ans := make([]interface{}, len(aar.AutomationAgents))
     for i, aa := range aar.AutomationAgents {
         ans[i] = aa

controllers/om/deployment.go

Lines changed: 1 addition & 0 deletions
@@ -921,6 +921,7 @@ func (d Deployment) removeProcesses(processNames []string, log *zap.SugaredLogge
         for _, p2 := range processNames {
             if p.Name() == p2 {
                 found = true
+                break
             }
         }
         if !found {

controllers/om/mockedomclient.go

Lines changed: 10 additions & 2 deletions
@@ -68,6 +68,9 @@ type MockedOmConnection struct {
     hostResults      *host.Result
     agentHostnameMap map[string]struct{}
 
+    ReadAutomationStatusFunc func() (*AutomationStatus, error)
+    ReadAutomationAgentsFunc func(int) (Paginated, error)
+
     numRequestsSent int
     AgentAPIKey     string
     OrganizationsWithGroups map[*Organization][]*Project
@@ -449,6 +452,9 @@ func (oc *MockedOmConnection) GenerateAgentKey() (string, error) {
 
 func (oc *MockedOmConnection) ReadAutomationStatus() (*AutomationStatus, error) {
     oc.addToHistory(reflect.ValueOf(oc.ReadAutomationStatus))
+    if oc.ReadAutomationStatusFunc != nil {
+        return oc.ReadAutomationStatusFunc()
+    }
 
     if oc.AgentsDelayCount <= 0 {
         // Emulating "agents reached goal state": returning the proper status for all the
@@ -462,15 +468,17 @@ func (oc *MockedOmConnection) ReadAutomationStatus() (*AutomationStatus, error)
 
 func (oc *MockedOmConnection) ReadAutomationAgents(pageNum int) (Paginated, error) {
     oc.addToHistory(reflect.ValueOf(oc.ReadAutomationAgents))
+    if oc.ReadAutomationAgentsFunc != nil {
+        return oc.ReadAutomationAgentsFunc(pageNum)
+    }
 
     results := make([]AgentStatus, 0)
     for _, r := range oc.hostResults.Results {
         results = append(results,
             AgentStatus{Hostname: r.Hostname, LastConf: time.Now().Add(time.Second * -1).Format(time.RFC3339)})
     }
 
-    // todo extend this for real testing
-    return automationAgentStatusResponse{AutomationAgents: results}, nil
+    return AutomationAgentStatusResponse{AutomationAgents: results}, nil
 }
 
 func (oc *MockedOmConnection) GetHosts() (*host.Result, error) {
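The two new hook fields above make it possible to simulate the deadlock scenario in unit tests. The following is a hypothetical sketch, not a test from this commit: the helper name, package, import path, and hostnames are made up, while `MockedOmConnection.ReadAutomationStatusFunc`, `ReadAutomationAgentsFunc`, `AutomationAgentStatusResponse`, `AgentStatus`, `AutomationStatus`, and `ProcessStatus` are taken from the diffs in this commit.

```go
package om_test

import (
	"time"

	"github.com/10gen/ops-manager-kubernetes/controllers/om"
)

// simulateDeadlockedMongos is a hypothetical helper: it wires the new hook
// fields so that the mocked connection reports one mongos still executing
// RollingChangeArgs below the goal AC version, plus one process whose agent
// stopped pinging (stale, i.e. older than the 2-minute threshold).
func simulateDeadlockedMongos(mockedOm *om.MockedOmConnection) {
	mockedOm.ReadAutomationAgentsFunc = func(pageNum int) (om.Paginated, error) {
		return om.AutomationAgentStatusResponse{AutomationAgents: []om.AgentStatus{
			// Healthy agent next to the mongos: pinged seconds ago.
			{TypeName: "AUTOMATION", Hostname: "mongos-0-0-svc", LastConf: time.Now().Add(-10 * time.Second).Format(time.RFC3339)},
			// Agent on the failed cluster: last ping well past the staleness threshold.
			{TypeName: "AUTOMATION", Hostname: "config-2-0-svc", LastConf: time.Now().Add(-10 * time.Minute).Format(time.RFC3339)},
		}}, nil
	}
	mockedOm.ReadAutomationStatusFunc = func() (*om.AutomationStatus, error) {
		return &om.AutomationStatus{
			GoalVersion: 5,
			Processes: []om.ProcessStatus{
				// The mongos is one AC version behind and in the RollingChangeArgs move.
				{Hostname: "mongos-0-0-svc", Name: "mongos-0-0", LastGoalVersionAchieved: 4, Plan: []string{"RollingChangeArgs"}},
				// The stale process had already reached the goal version before going down.
				{Hostname: "config-2-0-svc", Name: "config-2-0", LastGoalVersionAchieved: 5},
			},
		}, nil
	}
}
```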

controllers/om/omclient.go

Lines changed: 1 addition & 1 deletion
@@ -512,7 +512,7 @@ func (oc *HTTPOmConnection) ReadAutomationAgents(pageNum int) (Paginated, error)
     if err != nil {
         return nil, err
     }
-    var resp automationAgentStatusResponse
+    var resp AutomationAgentStatusResponse
     if err := json.Unmarshal(ans, &resp); err != nil {
         return nil, err
     }

controllers/om/replicaset/om_replicaset.go

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ func BuildFromStatefulSetWithReplicas(set appsv1.StatefulSet, dbSpec mdbv1.DbSpe
 // https://jira.mongodb.org/browse/HELP-3818?focusedCommentId=1548348 for more details)
 // Note, that we are skipping setting nodes as "disabled" (but the code is commented to be able to revert this if
 // needed)
-func PrepareScaleDownFromMap(omClient om.Connection, rsMembers map[string][]string, healthyProcessesToWaitForGoalState []string, log *zap.SugaredLogger) error {
+func PrepareScaleDownFromMap(omClient om.Connection, rsMembers map[string][]string, processesToWaitForGoalState []string, log *zap.SugaredLogger) error {
     processes := make([]string, 0)
     for _, v := range rsMembers {
         processes = append(processes, v...)
@@ -59,7 +59,7 @@ func PrepareScaleDownFromMap(omClient om.Connection, rsMembers map[string][]stri
         return xerrors.Errorf("unable to set votes, priority to 0 in Ops Manager, hosts: %v, err: %w", processes, err)
     }
 
-    if err := om.WaitForReadyState(omClient, healthyProcessesToWaitForGoalState, false, log); err != nil {
+    if err := om.WaitForReadyState(omClient, processesToWaitForGoalState, false, log); err != nil {
         return err
     }
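With the parameter renamed to `processesToWaitForGoalState`, the caller decides which processes to wait on, which is how the workaround can exclude deadlocked mongos. Below is a hedged sketch of such a caller: the helper name, the filtering policy, and the import paths are assumptions, while `PrepareScaleDownFromMap`, `GetMongoDBClusterState`, and `IsStale` come from this commit.

```go
package example

import (
	"go.uber.org/zap"

	"github.com/10gen/ops-manager-kubernetes/controllers/om"
	"github.com/10gen/ops-manager-kubernetes/controllers/om/replicaset"
	"github.com/10gen/ops-manager-kubernetes/controllers/operator/agents"
)

// prepareScaleDownSkippingStale is an illustrative caller: it waits only for
// processes whose agents are still pinging Ops Manager, which is the kind of
// filtered list the renamed parameter is meant to receive.
func prepareScaleDownSkippingStale(omClient om.Connection, rsMembers map[string][]string, log *zap.SugaredLogger) error {
	clusterState, err := agents.GetMongoDBClusterState(omClient)
	if err != nil {
		return err
	}

	var processesToWaitFor []string
	for _, p := range clusterState.GetProcesses() {
		// Skip stale processes: their agents have not pinged OM for more than 2 minutes.
		if !p.IsStale() && p.ProcessName != "" {
			processesToWaitFor = append(processesToWaitFor, p.ProcessName)
		}
	}

	return replicaset.PrepareScaleDownFromMap(omClient, rsMembers, processesToWaitFor, log)
}
```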

controllers/operator/agents/agents.go

Lines changed: 184 additions & 41 deletions
@@ -3,6 +3,9 @@ package agents
 import (
     "context"
     "fmt"
+    "maps"
+    "slices"
+    "time"
 
     "go.uber.org/zap"
     "golang.org/x/xerrors"
@@ -30,6 +33,8 @@ type retryParams struct {
     retrials int
 }
 
+const RollingChangeArgs = "RollingChangeArgs"
+
 // EnsureAgentKeySecretExists checks if the Secret with specified name (<groupId>-group-secret) exists, otherwise tries to
 // generate agent key using OM public API and create Secret containing this key. Generation of a key is expected to be
 // a rare operation as the group creation api generates agent key already (so the only possible situation is when the group
@@ -107,6 +112,184 @@ func getAgentRegisterError(errorMsg string) error {
         "name ('cluster.local'): %s", errorMsg))
 }
 
+const StaleProcessDuration = time.Minute * 2
+
+// ProcessState represents the state of the mongodb process.
+// Most importantly it contains the information whether the node is down (precisely whether the agent running next to mongod is actively reporting pings to OM),
+// what is the last version of the automation config achieved and the step on which the agent is currently executing the plan.
+type ProcessState struct {
+    Hostname            string
+    LastAgentPing       time.Time
+    GoalVersionAchieved int
+    Plan                []string
+    ProcessName         string
+}
+
+// NewProcessState should be used to create new instances of ProcessState as it sets some reasonable default values.
+// As ProcessState is combining the data from two sources, we don't have any guarantees that we'll have the information about the given hostname
+// available from both sources, therefore we need to always assume some defaults.
+func NewProcessState(hostname string) ProcessState {
+    return ProcessState{
+        Hostname:            hostname,
+        LastAgentPing:       time.Time{},
+        GoalVersionAchieved: -1,
+        Plan:                nil,
+    }
+}
+
+// IsStale returns true if this process is considered down, i.e. last ping of the agent is later than 2 minutes ago
+// We use an in-the-middle value when considering the process to be down:
+// - in waitForAgentsToRegister we use 1 min to consider the process "not registered"
+// - Ops Manager is using 5 mins as a default for considering process as stale
+func (p ProcessState) IsStale() bool {
+    return p.LastAgentPing.Add(StaleProcessDuration).Before(time.Now())
+}
+
+// MongoDBClusterStateInOM represents the state of the whole deployment from the Ops Manager's perspective by combining singnals about the processes from two sources:
+// - from om.Connection.ReadAutomationAgents to get last ping of the agent (/groups/<groupId>/agents/AUTOMATION)
+// - from om.Connection.ReadAutomationStatus to get the list of agent health statuses, AC version achieved, step of the agent's plan (/groups/<groupId>/automationStatus)
+type MongoDBClusterStateInOM struct {
+    GoalVersion     int
+    ProcessStateMap map[string]ProcessState
+}
+
+// GetMongoDBClusterState executes requests to OM from the given omConnection to gather the current deployment state.
+// It combines the data from the automation status and the list of automation agents.
+func GetMongoDBClusterState(omConnection om.Connection) (MongoDBClusterStateInOM, error) {
+    var agentStatuses []om.AgentStatus
+    _, err := om.TraversePages(
+        omConnection.ReadAutomationAgents,
+        func(aa interface{}) bool {
+            agentStatuses = append(agentStatuses, aa.(om.AgentStatus))
+            return false
+        },
+    )
+    if err != nil {
+        return MongoDBClusterStateInOM{}, xerrors.Errorf("error when reading automation agent pages: %v", err)
+    }
+
+    automationStatus, err := omConnection.ReadAutomationStatus()
+    if err != nil {
+        return MongoDBClusterStateInOM{}, xerrors.Errorf("error reading automation status: %v", err)
+    }
+
+    processStateMap, err := calculateProcessStateMap(automationStatus.Processes, agentStatuses)
+    if err != nil {
+        return MongoDBClusterStateInOM{}, err
+    }
+
+    return MongoDBClusterStateInOM{
+        GoalVersion:     automationStatus.GoalVersion,
+        ProcessStateMap: processStateMap,
+    }, nil
+}
+
+func (c *MongoDBClusterStateInOM) GetProcessState(hostname string) ProcessState {
+    if processState, ok := c.ProcessStateMap[hostname]; ok {
+        return processState
+    }
+
+    return NewProcessState(hostname)
+}
+
+func (c *MongoDBClusterStateInOM) GetProcesses() []ProcessState {
+    return slices.Collect(maps.Values(c.ProcessStateMap))
+}
+
+func (c *MongoDBClusterStateInOM) GetProcessesNotInGoalState() []ProcessState {
+    return slices.DeleteFunc(slices.Collect(maps.Values(c.ProcessStateMap)), func(processState ProcessState) bool {
+        return processState.GoalVersionAchieved >= c.GoalVersion
+    })
+}
+
+// calculateProcessStateMap combines information from ProcessStatuses and AgentStatuses returned by OpsManager
+// and maps them to a unified data structure.
+//
+// The resulting ProcessState combines information from both agent and process status when refer to the same hostname.
+// It is not guaranteed that we'll have the information from two sources, so in case one side is missing the defaults
+// would be present as defined in NewProcessState.
+// If multiple statuses exist for the same hostname, subsequent entries overwrite ones.
+// Fields such as GoalVersionAchieved default to -1 if never set, and Plan defaults to nil.
+// LastAgentPing defaults to the zero time if no AgentStatus entry is available.
+func calculateProcessStateMap(processStatuses []om.ProcessStatus, agentStatuses []om.AgentStatus) (map[string]ProcessState, error) {
+    processStates := map[string]ProcessState{}
+    for _, agentStatus := range agentStatuses {
+        if agentStatus.TypeName != "AUTOMATION" {
+            return nil, xerrors.Errorf("encountered unexpected agent type in agent status type in %+v", agentStatus)
+        }
+        processState, ok := processStates[agentStatus.Hostname]
+        if !ok {
+            processState = NewProcessState(agentStatus.Hostname)
+        }
+        lastPing, err := time.Parse(time.RFC3339, agentStatus.LastConf)
+        if err != nil {
+            return nil, xerrors.Errorf("wrong format for lastConf field: expected UTC format but the value is %s, agentStatus=%+v: %v", agentStatus.LastConf, agentStatus, err)
+        }
+        processState.LastAgentPing = lastPing
+
+        processStates[agentStatus.Hostname] = processState
+    }
+
+    for _, processStatus := range processStatuses {
+        processState, ok := processStates[processStatus.Hostname]
+        if !ok {
+            processState = NewProcessState(processStatus.Hostname)
+        }
+        processState.GoalVersionAchieved = processStatus.LastGoalVersionAchieved
+        processState.ProcessName = processStatus.Name
+        processState.Plan = processStatus.Plan
+        processStates[processStatus.Hostname] = processState
+    }
+
+    return processStates, nil
+}
+
+func agentCheck(omConnection om.Connection, agentHostnames []string, log *zap.SugaredLogger) (string, bool) {
+    registeredHostnamesSet := map[string]struct{}{}
+    predicateFunc := func(aa interface{}) bool {
+        automationAgent := aa.(om.Status)
+        for _, hostname := range agentHostnames {
+            if automationAgent.IsRegistered(hostname, log) {
+                registeredHostnamesSet[hostname] = struct{}{}
+                if len(registeredHostnamesSet) == len(agentHostnames) {
+                    return true
+                }
+            }
+        }
+        return false
+    }
+
+    _, err := om.TraversePages(
+        omConnection.ReadAutomationAgents,
+        predicateFunc,
+    )
+    if err != nil {
+        return fmt.Sprintf("Received error when reading automation agent pages: %v", err), false
+    }
+
+    // convert to list of keys only for pretty printing in the error message
+    var registeredHostnamesList []string
+    for hostname := range registeredHostnamesSet {
+        registeredHostnamesList = append(registeredHostnamesList, hostname)
+    }
+
+    var msg string
+    if len(registeredHostnamesList) == 0 {
+        return fmt.Sprintf("None of %d expected agents has registered with OM, expected hostnames: %+v", len(agentHostnames), agentHostnames), false
+    } else if len(registeredHostnamesList) == len(agentHostnames) {
+        return fmt.Sprintf("All of %d expected agents have registered with OM, hostnames: %+v", len(registeredHostnamesList), registeredHostnamesList), true
+    } else {
+        var missingHostnames []string
+        for _, expectedHostname := range agentHostnames {
+            if _, ok := registeredHostnamesSet[expectedHostname]; !ok {
+                missingHostnames = append(missingHostnames, expectedHostname)
+            }
+        }
+        msg = fmt.Sprintf("Only %d of %d expected agents have registered with OM, missing hostnames: %+v, registered hostnames in OM: %+v, expected hostnames: %+v", len(registeredHostnamesList), len(agentHostnames), missingHostnames, registeredHostnamesList, agentHostnames)
+        return msg, false
+    }
+}
+
 // waitUntilRegistered waits until all agents with 'agentHostnames' are registered in OM. Note, that wait
 // happens after retrial - this allows to skip waiting in case agents are already registered
 func waitUntilRegistered(omConnection om.Connection, log *zap.SugaredLogger, r retryParams, agentHostnames ...string) (bool, string) {
@@ -120,47 +303,7 @@ func waitUntilRegistered(omConnection om.Connection, log *zap.SugaredLogger, r r
     retrials := env.ReadIntOrDefault(util.PodWaitRetriesEnv, r.retrials)
 
     agentsCheckFunc := func() (string, bool) {
-        registeredHostnamesMap := map[string]struct{}{}
-        _, err := om.TraversePages(
-            omConnection.ReadAutomationAgents,
-            func(aa interface{}) bool {
-                automationAgent := aa.(om.Status)
-                for _, hostname := range agentHostnames {
-                    if automationAgent.IsRegistered(hostname, log) {
-                        registeredHostnamesMap[hostname] = struct{}{}
-                        if len(registeredHostnamesMap) == len(agentHostnames) {
-                            return true
-                        }
-                    }
-                }
-                return false
-            },
-        )
-        if err != nil {
-            log.Errorw("Received error when reading automation agent pages", "err", err)
-        }
-
-        // convert to list of keys only for pretty printing in the error message
-        var registeredHostnamesList []string
-        for hostname := range registeredHostnamesMap {
-            registeredHostnamesList = append(registeredHostnamesList, hostname)
-        }
-
-        var msg string
-        if len(registeredHostnamesList) == 0 {
-            return fmt.Sprintf("None of %d expected agents has registered with OM, expected hostnames: %+v", len(agentHostnames), agentHostnames), false
-        } else if len(registeredHostnamesList) == len(agentHostnames) {
-            return fmt.Sprintf("All of %d expected agents have registered with OM, hostnames: %+v", len(registeredHostnamesList), registeredHostnamesList), true
-        } else {
-            var missingHostnames []string
-            for _, expectedHostname := range agentHostnames {
-                if _, ok := registeredHostnamesMap[expectedHostname]; !ok {
-                    missingHostnames = append(missingHostnames, expectedHostname)
-                }
-            }
-            msg = fmt.Sprintf("Only %d of %d expected agents have registered with OM, missing hostnames: %+v, registered hostnames in OM: %+v, expected hostnames: %+v", len(registeredHostnamesList), len(agentHostnames), missingHostnames, registeredHostnamesList, agentHostnames)
-            return msg, false
-        }
+        return agentCheck(omConnection, agentHostnames, log)
     }
 
     return util.DoAndRetry(agentsCheckFunc, log, retrials, waitSeconds)
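As a small usage illustration of the combined state and its defaults (hypothetical helper and assumed import paths; the API itself is defined in the diff above): a hostname that neither endpoint reported falls back to `NewProcessState`, i.e. goal version -1 and a zero last ping, so it is reported as stale.

```go
package example

import (
	"fmt"

	"github.com/10gen/ops-manager-kubernetes/controllers/om"
	"github.com/10gen/ops-manager-kubernetes/controllers/operator/agents"
)

// inspectHostname is a hypothetical helper showing the defaulting behaviour of
// the combined state: a hostname missing from both OM endpoints falls back to
// NewProcessState, i.e. GoalVersionAchieved == -1 and a zero LastAgentPing,
// which makes IsStale() return true.
func inspectHostname(omConnection om.Connection, hostname string) error {
	state, err := agents.GetMongoDBClusterState(omConnection)
	if err != nil {
		return err
	}

	p := state.GetProcessState(hostname)
	fmt.Printf("process %q on %s: goalAchieved=%d (cluster goal %d), lastPing=%s, stale=%t, plan=%v\n",
		p.ProcessName, p.Hostname, p.GoalVersionAchieved, state.GoalVersion, p.LastAgentPing, p.IsStale(), p.Plan)
	return nil
}
```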
