From 4c80b61a8afb4d8d2757a1eda1056f90260e0622 Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Tue, 10 Jun 2025 12:51:27 +0530 Subject: [PATCH 1/7] Management Server - Prepare for Maintenance and Cancel Maintenance improvements: - Added new setting 'management.server.maintenance.ignore.maintenance.hosts' to ignore hosts in maintenance states while preparing management server for maintenance. This skips agent transfer and agents count check for hosts in maintenance. - Rebalance indirect agents after cancel maintenance, using rebalance parameter in cancelMaintenance API - Force maintenance after maintenance window timeout, using forced parameter in prepareForMaintenance API. - Propagate 'indirect.agent.lb.check.interval' setting change to the host agents. --- .../src/main/java/com/cloud/agent/Agent.java | 14 ++- .../exception/OperationTimedoutException.java | 2 +- .../com/cloud/resource/ResourceState.java | 7 +- .../apache/cloudstack/api/ApiConstants.java | 1 + .../agent/lb/SetupMSListCommand.java | 9 +- .../java/com/cloud/agent/AgentManager.java | 2 +- .../cloud/agent/manager/AgentManagerImpl.java | 2 +- .../manager/ClusteredAgentManagerImpl.java | 29 +++-- .../main/java/com/cloud/host/dao/HostDao.java | 18 ++- .../java/com/cloud/host/dao/HostDaoImpl.java | 42 ++++++- .../cloudstack/agent/lb/IndirectAgentLB.java | 8 +- .../api/command/CancelMaintenanceCmd.java | 10 ++ .../api/command/PrepareForMaintenanceCmd.java | 8 ++ .../ManagementServerMaintenanceManager.java | 11 +- ...anagementServerMaintenanceManagerImpl.java | 59 +++++++--- ...aintenanceManagementServerHostCommand.java | 10 ++ ...aintenanceManagementServerHostCommand.java | 8 +- ...ementServerMaintenanceManagerImplTest.java | 16 +-- .../ConfigurationManagerImpl.java | 31 ++++-- .../agent/lb/IndirectAgentLBServiceImpl.java | 104 +++++++++++++----- .../lb/IndirectAgentLBServiceImplTest.java | 4 +- ui/public/locales/en.json | 1 + .../config/section/infra/managementServers.js | 1 + ui/src/views/infra/Confirmation.vue | 7 ++ .../com/cloud/utils/nio/NioConnection.java | 2 +- 25 files changed, 310 insertions(+), 96 deletions(-) diff --git a/agent/src/main/java/com/cloud/agent/Agent.java b/agent/src/main/java/com/cloud/agent/Agent.java index fcd4234a1361..264480606c68 100644 --- a/agent/src/main/java/com/cloud/agent/Agent.java +++ b/agent/src/main/java/com/cloud/agent/Agent.java @@ -928,7 +928,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) { return new SetupCertificateAnswer(true); } - private void processManagementServerList(final List msList, final List avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) { + private void processManagementServerList(final List msList, final List avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final boolean triggerHostLB) { if (CollectionUtils.isNotEmpty(msList) && StringUtils.isNotEmpty(lbAlgorithm)) { try { final String newMSHosts = String.format("%s%s%s", com.cloud.utils.StringUtils.toCSVList(msList), IAgentShell.hostLbAlgorithmSeparator, lbAlgorithm); @@ -941,6 +941,12 @@ private void processManagementServerList(final List msList, final List msList, final List { migrateAgentConnection(cmd.getAvoidMsList()); @@ -1046,7 +1052,7 @@ public void processReadyCommand(final Command cmd) { } verifyAgentArch(ready.getArch()); - processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval()); + processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval(), false); logger.info("Ready command is processed for agent [id: {}, uuid: {}, name: {}]", getId(), getUuid(), getName()); } diff --git a/api/src/main/java/com/cloud/exception/OperationTimedoutException.java b/api/src/main/java/com/cloud/exception/OperationTimedoutException.java index fe27408eb4e3..66b607100d97 100644 --- a/api/src/main/java/com/cloud/exception/OperationTimedoutException.java +++ b/api/src/main/java/com/cloud/exception/OperationTimedoutException.java @@ -40,7 +40,7 @@ public class OperationTimedoutException extends CloudException { boolean _isActive; public OperationTimedoutException(Command[] cmds, long agentId, long seqId, int time, boolean isActive) { - super("Commands " + seqId + " to Host " + agentId + " timed out after " + time); + super("Commands " + seqId + " to Host " + agentId + " timed out after " + time + " secs"); _agentId = agentId; _seqId = seqId; _time = time; diff --git a/api/src/main/java/com/cloud/resource/ResourceState.java b/api/src/main/java/com/cloud/resource/ResourceState.java index 70738c7921bc..e91cf820b081 100644 --- a/api/src/main/java/com/cloud/resource/ResourceState.java +++ b/api/src/main/java/com/cloud/resource/ResourceState.java @@ -76,6 +76,10 @@ public static Event toEvent(String e) { } } + public static List s_maintenanceStates = List.of(ResourceState.Maintenance, + ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance, + ResourceState.ErrorInPrepareForMaintenance); + public ResourceState getNextState(Event a) { return s_fsm.getNextState(this, a); } @@ -98,8 +102,7 @@ public static String[] toString(ResourceState... states) { } public static boolean isMaintenanceState(ResourceState state) { - return Arrays.asList(ResourceState.Maintenance, ResourceState.ErrorInMaintenance, - ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance).contains(state); + return s_maintenanceStates.contains(state); } public static boolean canAttemptMaintenance(ResourceState state) { diff --git a/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java b/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java index 304e43009f26..36929e74a54d 100644 --- a/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java +++ b/api/src/main/java/org/apache/cloudstack/api/ApiConstants.java @@ -432,6 +432,7 @@ public class ApiConstants { public static final String PUBLIC_END_PORT = "publicendport"; public static final String PUBLIC_ZONE = "publiczone"; public static final String PURGE_RESOURCES = "purgeresources"; + public static final String REBALANCE = "rebalance"; public static final String RECEIVED_BYTES = "receivedbytes"; public static final String RECONNECT = "reconnect"; public static final String RECOVER = "recover"; diff --git a/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java b/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java index 32f436434c17..864a3e22eb3e 100644 --- a/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java +++ b/core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java @@ -29,13 +29,15 @@ public class SetupMSListCommand extends Command { private List avoidMsList; private String lbAlgorithm; private Long lbCheckInterval; + private Boolean triggerHostLb; - public SetupMSListCommand(final List msList, final List avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) { + public SetupMSListCommand(final List msList, final List avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final Boolean triggerHostLb) { super(); this.msList = msList; this.avoidMsList = avoidMsList; this.lbAlgorithm = lbAlgorithm; this.lbCheckInterval = lbCheckInterval; + this.triggerHostLb = triggerHostLb; } public List getMsList() { @@ -54,9 +56,12 @@ public Long getLbCheckInterval() { return lbCheckInterval; } + public boolean getTriggerHostLb() { + return triggerHostLb; + } + @Override public boolean executeInSequence() { return false; } - } diff --git a/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java b/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java index dd388d2a2d8a..0aa5805b1601 100644 --- a/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java +++ b/engine/components-api/src/main/java/com/cloud/agent/AgentManager.java @@ -171,5 +171,5 @@ enum TapAgentsAction { void propagateChangeToAgents(Map params); - boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs); + boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance); } diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java index 585c479f65f8..fb17bf681ad1 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java @@ -2193,7 +2193,7 @@ public void propagateChangeToAgents(Map params) { } @Override - public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) { + public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { return true; } diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java index 8795c8d428fd..e72736a972ba 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java @@ -42,6 +42,7 @@ import javax.net.ssl.SSLContext; import javax.net.ssl.SSLEngine; +import com.cloud.resource.ResourceState; import org.apache.cloudstack.ca.CAManager; import org.apache.cloudstack.framework.config.ConfigDepot; import org.apache.cloudstack.framework.config.ConfigKey; @@ -431,10 +432,10 @@ public boolean routeToPeer(final String peer, final byte[] bytes) { ch = connectToPeer(peer, ch); if (ch == null) { try { - logD(bytes, "Unable to route to peer: " + Request.parse(bytes)); + logD(bytes, "Unable to establish connection to route to peer: " + Request.parse(bytes)); } catch (ClassNotFoundException | UnsupportedVersionException e) { // Request.parse thrown exception when we try to log it, log as much as we can - logD(bytes, "Unable to route to peer, and Request.parse further caught exception" + e.getMessage()); + logD(bytes, "Unable to establish connection to route to peer, and Request.parse further caught exception" + e.getMessage()); } return false; } @@ -643,7 +644,6 @@ protected void doTask(final Task task) throws TaskExecutionException { final Link link = task.getLink(); if (Request.fromServer(data)) { - final AgentAttache agent = findAttache(hostId); if (Request.isControl(data)) { @@ -691,7 +691,6 @@ protected void doTask(final Task task) throws TaskExecutionException { cancel(Long.toString(Request.getManagementServerId(data)), hostId, Request.getSequence(data), e.getMessage()); } } else { - final long mgmtId = Request.getManagementServerId(data); if (mgmtId != -1 && mgmtId != _nodeId) { routeToPeer(Long.toString(mgmtId), data); @@ -1352,7 +1351,7 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS if (cmd instanceof PrepareForMaintenanceManagementServerHostCommand) { logger.debug("Received PrepareForMaintenanceManagementServerHostCommand - preparing for maintenance"); try { - managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm()); + managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm(), ((PrepareForMaintenanceManagementServerHostCommand) cmd).isForced()); return "Successfully prepared for maintenance"; } catch(CloudRuntimeException e) { return e.getMessage(); @@ -1399,14 +1398,14 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS } @Override - public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) { + public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { if (timeoutDurationInMs <= 0) { logger.debug("Not transferring direct agents from management server node {} (id: {}) to other nodes, invalid timeout duration", fromMsId, fromMsUuid); return false; } long transferStartTimeInMs = System.currentTimeMillis(); - if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId))) { + if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId, excludeHostsInMaintenance))) { logger.info("No direct agent hosts available on management server node {} (id: {}), to transfer", fromMsId, fromMsUuid); return true; } @@ -1421,7 +1420,7 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long int agentTransferFailedCount = 0; List dataCenterList = dcDao.listAll(); for (DataCenterVO dc : dataCenterList) { - List directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId()); + List directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId(), excludeHostsInMaintenance); if (CollectionUtils.isEmpty(directAgentHostsInDc)) { continue; } @@ -1455,9 +1454,9 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long return (agentTransferFailedCount == 0); } - private List getDirectAgentHosts(long msId) { + private List getDirectAgentHosts(long msId, boolean excludeHostsInMaintenance) { List directAgentHosts = new ArrayList<>(); - List hosts = _hostDao.listHostsByMs(msId); + List hosts = _hostDao.listHostsByMsResourceState(msId, null); for (HostVO host : hosts) { AgentAttache agent = findAttache(host.getId()); if (agent instanceof DirectAgentAttache) { @@ -1468,9 +1467,11 @@ private List getDirectAgentHosts(long msId) { return directAgentHosts; } - private List getDirectAgentHostsInDc(long msId, long dcId) { + private List getDirectAgentHostsInDc(long msId, long dcId, boolean excludeHostsInMaintenance) { List directAgentHosts = new ArrayList<>(); - List hosts = _hostDao.listHostsByMsAndDc(msId, dcId); + // To exclude maintenance states use values from ResourceState as source of truth + List statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of(); + List hosts = _hostDao.listHostsByMsDcResourceState(msId, dcId, statesToExclude); for (HostVO host : hosts) { AgentAttache agent = findAttache(host.getId()); if (agent instanceof DirectAgentAttache) { @@ -1506,6 +1507,10 @@ public void onManagementServerPreparingForMaintenance() { public void onManagementServerCancelPreparingForMaintenance() { logger.debug("Management server cancel preparing for maintenance"); super.onManagementServerPreparingForMaintenance(); + + // needed for the case when Management Server in Preparing For Maintenance but didn't go to Maintenance state + // (where this variable will be reset) + _agentLbHappened = false; } @Override diff --git a/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java b/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java index 2b8a23a1b510..090b019334f4 100644 --- a/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java +++ b/engine/schema/src/main/java/com/cloud/host/dao/HostDao.java @@ -177,14 +177,24 @@ public interface HostDao extends GenericDao, StateDao listHostsByMsAndDc(long msId, long dcId); + List listHostsByMsDcResourceState(long msId, long dcId, List excludedResourceStates); + List listHostsByMs(long msId); + List listHostsByMsResourceState(long msId, List excludedResourceStates); + /** - * Retrieves the number of hosts/agents this {@see ManagementServer} has responsibility over. - * @param msId the id of the {@see ManagementServer} - * @return the number of hosts/agents this {@see ManagementServer} has responsibility over + * Count Hosts by given Management Server, Host and Hypervisor Types, + * and exclude Hosts with given Resource States. + * + * @param msId Management Server Id + * @param excludedResourceStates Resource States to be excluded + * @param hostTypes Host Types + * @param hypervisorTypes Hypervisor Types + * @return Hosts count */ - int countByMs(long msId); + int countHostsByMsResourceStateTypeAndHypervisorType(long msId, List excludedResourceStates, + List hostTypes, List hypervisorTypes); /** * Retrieves the host ids/agents this {@see ManagementServer} has responsibility over. diff --git a/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java b/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java index 61fa3edcf227..8f218841b074 100644 --- a/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java +++ b/engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java @@ -72,6 +72,7 @@ import com.cloud.utils.db.GenericSearchBuilder; import com.cloud.utils.db.JoinBuilder; import com.cloud.utils.db.JoinBuilder.JoinType; +import com.cloud.utils.db.QueryBuilder; import com.cloud.utils.db.SearchBuilder; import com.cloud.utils.db.SearchCriteria; import com.cloud.utils.db.SearchCriteria.Func; @@ -1600,6 +1601,17 @@ public List listHostsByMsAndDc(long msId, long dcId) { return listBy(sc); } + @Override + public List listHostsByMsDcResourceState(long msId, long dcId, List excludedResourceStates) { + QueryBuilder sc = QueryBuilder.create(HostVO.class); + sc.and(sc.entity().getManagementServerId(), Op.EQ, msId); + sc.and(sc.entity().getDataCenterId(), Op.EQ, dcId); + if (CollectionUtils.isNotEmpty(excludedResourceStates)) { + sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray()); + } + return listBy(sc.create()); + } + @Override public List listHostsByMs(long msId) { SearchCriteria sc = ResponsibleMsSearch.create(); @@ -1608,10 +1620,32 @@ public List listHostsByMs(long msId) { } @Override - public int countByMs(long msId) { - SearchCriteria sc = ResponsibleMsSearch.create(); - sc.setParameters("managementServerId", msId); - return getCount(sc); + public List listHostsByMsResourceState(long msId, List excludedResourceStates) { + QueryBuilder sc = QueryBuilder.create(HostVO.class); + sc.and(sc.entity().getManagementServerId(), Op.EQ, msId); + if (CollectionUtils.isNotEmpty(excludedResourceStates)) { + sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray()); + } + return listBy(sc.create()); + } + + @Override + public int countHostsByMsResourceStateTypeAndHypervisorType(long msId, + List excludedResourceStates, + List hostTypes, + List hypervisorTypes) { + QueryBuilder sc = QueryBuilder.create(HostVO.class); + sc.and(sc.entity().getManagementServerId(), Op.EQ, msId); + if (CollectionUtils.isNotEmpty(excludedResourceStates)) { + sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray()); + } + if (CollectionUtils.isNotEmpty(hostTypes)) { + sc.and(sc.entity().getType(), Op.IN, hostTypes.toArray()); + } + if (CollectionUtils.isNotEmpty(hypervisorTypes)) { + sc.and(sc.entity().getHypervisorType(), Op.IN, hypervisorTypes.toArray()); + } + return getCount(sc.create()); } @Override diff --git a/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java b/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java index b136b8e842b8..780a09b883e0 100644 --- a/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java +++ b/framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java @@ -70,9 +70,11 @@ public interface IndirectAgentLB { */ Long getLBPreferredHostCheckInterval(Long clusterId); - void propagateMSListToAgents(); + void propagateMSListToAgents(boolean triggerHostLB); - boolean haveAgentBasedHosts(long msId); + void propagateMSListToAgentsInCluster(Long clusterId); - boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs); + boolean haveAgentBasedHosts(long msId, boolean excludeHostsInMaintenance); + + boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs, boolean excludeHostsInMaintenance); } diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java index a0f091ef1e4a..918d40137352 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java @@ -18,12 +18,15 @@ package org.apache.cloudstack.api.command; import org.apache.cloudstack.api.APICommand; +import org.apache.cloudstack.api.ApiConstants; import org.apache.cloudstack.api.BaseCmd; import com.cloud.user.Account; +import org.apache.cloudstack.api.Parameter; import org.apache.cloudstack.api.response.ManagementServerMaintenanceResponse; import org.apache.cloudstack.acl.RoleType; +import org.apache.commons.lang3.BooleanUtils; @APICommand(name = CancelMaintenanceCmd.APINAME, description = "Cancels maintenance of the management server", @@ -36,6 +39,13 @@ public class CancelMaintenanceCmd extends BaseMSMaintenanceActionCmd { public static final String APINAME = "cancelMaintenance"; + @Parameter(name = ApiConstants.REBALANCE, type = CommandType.BOOLEAN, description = "Rebalance agents (applicable for indirect agents) after cancelling maintenance, default is true") + private Boolean rebalance; + + public boolean getRebalance() { + return BooleanUtils.toBooleanDefaultIfNull(rebalance, true); + } + @Override public String getCommandName() { return APINAME.toLowerCase() + BaseCmd.RESPONSE_SUFFIX; diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java index 3c036c4c35f2..2b63b28e0c5b 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/PrepareForMaintenanceCmd.java @@ -26,6 +26,7 @@ import org.apache.cloudstack.api.response.ManagementServerMaintenanceResponse; import org.apache.cloudstack.acl.RoleType; +import org.apache.commons.lang3.BooleanUtils; @APICommand(name = PrepareForMaintenanceCmd.APINAME, description = "Prepares management server for maintenance by preventing new jobs from being accepted after completion of active jobs and migrating the agents", @@ -40,6 +41,9 @@ public class PrepareForMaintenanceCmd extends BaseMSMaintenanceActionCmd { " when this is not set, already configured algorithm from setting 'indirect.agent.lb.algorithm' is considered") private String algorithm; + @Parameter(name = ApiConstants.FORCED, type = CommandType.BOOLEAN, description = "Force management server to maintenance after the maintenance window timeout, default is false") + private Boolean forced; + public String getAlgorithm() { return algorithm; } @@ -48,6 +52,10 @@ public void setAlgorithm(String algorithm) { this.algorithm = algorithm; } + public boolean isForced() { + return BooleanUtils.toBooleanDefaultIfNull(forced, false); + } + @Override public String getCommandName() { return APINAME.toLowerCase() + BaseCmd.RESPONSE_SUFFIX; diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java index 3af19164cc93..40b6fcd6abe3 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManager.java @@ -40,6 +40,15 @@ public interface ManagementServerMaintenanceManager { ConfigKey.Scope.Global, null); + ConfigKey ManagementServerMaintenanceIgnoreMaintenanceHosts = new ConfigKey<>(Boolean.class, + "management.server.maintenance.ignore.maintenance.hosts", + "Advanced", + String.valueOf(Boolean.FALSE), + "Host in Maintenance state can sometimes block Management Server to go to Maintenance; this setting skips Host(s) in Maintenance state during Management Server Maintenance, default: false.", + true, + ConfigKey.Scope.Global, + null); + void registerListener(ManagementServerMaintenanceListener listener); void unregisterListener(ManagementServerMaintenanceListener listener); @@ -83,7 +92,7 @@ public interface ManagementServerMaintenanceManager { String getLbAlgorithm(); // Prepares the current management server for maintenance by migrating the agents and not accepting any more async jobs - void prepareForMaintenance(String lbAlorithm); + void prepareForMaintenance(String lbAlorithm, boolean forced); // Cancels maintenance of the current management server void cancelMaintenance(); diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java index fcfa32d6ce88..c28c4dffe04d 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImpl.java @@ -26,7 +26,9 @@ import javax.inject.Inject; +import com.cloud.resource.ResourceState; import org.apache.cloudstack.agent.lb.IndirectAgentLB; +import org.apache.cloudstack.agent.lb.IndirectAgentLBServiceImpl; import org.apache.cloudstack.api.command.CancelMaintenanceCmd; import org.apache.cloudstack.api.command.CancelShutdownCmd; import org.apache.cloudstack.api.command.PrepareForMaintenanceCmd; @@ -251,7 +253,7 @@ private void prepareForShutdown(boolean postTrigger) { this.preparingForShutdown = true; jobManager.disableAsyncJobs(); - waitForPendingJobs(); + waitForPendingJobs(false); } @Override @@ -273,7 +275,7 @@ public void cancelShutdown() { } @Override - public void prepareForMaintenance(String lbAlorithm) { + public void prepareForMaintenance(String lbAlorithm, boolean forced) { if (this.preparingForShutdown) { throw new CloudRuntimeException("Shutdown has already been triggered, cancel shutdown and try again"); } @@ -286,7 +288,7 @@ public void prepareForMaintenance(String lbAlorithm) { this.lbAlgorithm = lbAlorithm; jobManager.disableAsyncJobs(); onPreparingForMaintenance(); - waitForPendingJobs(); + waitForPendingJobs(forced); } @Override @@ -310,12 +312,13 @@ public void cancelMaintenance() { } } - private void waitForPendingJobs() { + private void waitForPendingJobs(boolean forceMaintenance) { cancelWaitForPendingJobs(); pendingJobsCheckTask = Executors.newScheduledThreadPool(1, new NamedThreadFactory("PendingJobsCheck")); long pendingJobsCheckDelayInSecs = 1L; // 1 sec long pendingJobsCheckPeriodInSecs = 3L; // every 3 secs, check more frequently for pending jobs - pendingJobsCheckTask.scheduleAtFixedRate(new CheckPendingJobsTask(this), pendingJobsCheckDelayInSecs, pendingJobsCheckPeriodInSecs, TimeUnit.SECONDS); + boolean ignoreMaintenanceHosts = ManagementServerMaintenanceIgnoreMaintenanceHosts.value(); + pendingJobsCheckTask.scheduleAtFixedRate(new CheckPendingJobsTask(this, ignoreMaintenanceHosts, forceMaintenance), pendingJobsCheckDelayInSecs, pendingJobsCheckPeriodInSecs, TimeUnit.SECONDS); } @Override @@ -426,7 +429,8 @@ public ManagementServerMaintenanceResponse prepareForMaintenance(PrepareForMaint checkAnyMsInPreparingStates("prepare for maintenance"); - if (indirectAgentLB.haveAgentBasedHosts(msHost.getMsid())) { + boolean ignoreMaintenanceHosts = ManagementServerMaintenanceIgnoreMaintenanceHosts.value(); + if (indirectAgentLB.haveAgentBasedHosts(msHost.getMsid(), ignoreMaintenanceHosts)) { List indirectAgentMsList = indirectAgentLB.getManagementServerList(); indirectAgentMsList.remove(msHost.getServiceIP()); List nonUpMsList = msHostDao.listNonUpStateMsIPs(); @@ -437,7 +441,7 @@ public ManagementServerMaintenanceResponse prepareForMaintenance(PrepareForMaint } final Command[] cmds = new Command[1]; - cmds[0] = new PrepareForMaintenanceManagementServerHostCommand(msHost.getMsid(), cmd.getAlgorithm()); + cmds[0] = new PrepareForMaintenanceManagementServerHostCommand(msHost.getMsid(), cmd.getAlgorithm(), cmd.isForced()); executeCmd(msHost, cmds); msHostDao.updateState(msHost.getId(), State.PreparingForMaintenance); @@ -457,10 +461,15 @@ public ManagementServerMaintenanceResponse cancelMaintenance(CancelMaintenanceCm } final Command[] cmds = new Command[1]; - cmds[0] = new CancelMaintenanceManagementServerHostCommand(msHost.getMsid()); + cmds[0] = new CancelMaintenanceManagementServerHostCommand(msHost.getMsid(), cmd.getRebalance()); executeCmd(msHost, cmds); msHostDao.updateState(msHost.getId(), State.Up); + + if (cmd.getRebalance()) { + logger.info("Propagate MS list and rebalance indirect agents"); + indirectAgentLB.propagateMSListToAgents(true); + } return prepareMaintenanceResponse(cmd.getManagementServerId()); } @@ -546,17 +555,21 @@ public String getConfigComponentName() { @Override public ConfigKey[] getConfigKeys() { return new ConfigKey[]{ - ManagementServerMaintenanceTimeoutInMins + ManagementServerMaintenanceTimeoutInMins, ManagementServerMaintenanceIgnoreMaintenanceHosts }; } private final class CheckPendingJobsTask extends ManagedContextRunnable { private ManagementServerMaintenanceManager managementServerMaintenanceManager; + private boolean ignoreMaintenanceHosts = false; private boolean agentsTransferTriggered = false; + private boolean forceMaintenance = false; - public CheckPendingJobsTask(ManagementServerMaintenanceManager managementServerMaintenanceManager) { + public CheckPendingJobsTask(ManagementServerMaintenanceManager managementServerMaintenanceManager, boolean ignoreMaintenanceHosts, boolean forceMaintenance) { this.managementServerMaintenanceManager = managementServerMaintenanceManager; + this.ignoreMaintenanceHosts = ignoreMaintenanceHosts; + this.forceMaintenance = forceMaintenance; } @Override @@ -570,6 +583,15 @@ protected void runInContext() { } if (managementServerMaintenanceManager.isPreparingForMaintenance() && isMaintenanceWindowExpired()) { + if (forceMaintenance) { + logger.debug("Maintenance window timeout, MS is forced to Maintenance Mode"); + ManagementServerHostVO msHost = msHostDao.findByMsid(ManagementServerNode.getManagementServerId()); + msHostDao.updateState(msHost.getId(), State.Maintenance); + managementServerMaintenanceManager.onMaintenance(); + managementServerMaintenanceManager.cancelWaitForPendingJobs(); + return; + } + logger.debug("Maintenance window timeout, terminating the pending jobs check timer task"); managementServerMaintenanceManager.cancelPreparingForMaintenance(null); managementServerMaintenanceManager.cancelWaitForPendingJobs(); @@ -577,7 +599,9 @@ protected void runInContext() { } long totalPendingJobs = managementServerMaintenanceManager.countPendingJobs(ManagementServerNode.getManagementServerId()); - int totalAgents = hostDao.countByMs(ManagementServerNode.getManagementServerId()); + + long totalAgents = totalAgentsInMs(); + String msg = String.format("Checking for triggered maintenance or shutdown... shutdownTriggered [%b] AllowAsyncJobs [%b] PendingJobCount [%d] AgentsCount [%d]", managementServerMaintenanceManager.isShutdownTriggered(), managementServerMaintenanceManager.isAsyncJobsEnabled(), totalPendingJobs, totalAgents); logger.debug(msg); @@ -609,7 +633,7 @@ protected void runInContext() { agentsTransferTriggered = true; logger.info(String.format("Preparing for maintenance - migrating agents from management server node %d (id: %s)", ManagementServerNode.getManagementServerId(), msHost.getUuid())); - boolean agentsMigrated = indirectAgentLB.migrateAgents(msHost.getUuid(), ManagementServerNode.getManagementServerId(), managementServerMaintenanceManager.getLbAlgorithm(), remainingMaintenanceWindowInMs()); + boolean agentsMigrated = indirectAgentLB.migrateAgents(msHost.getUuid(), ManagementServerNode.getManagementServerId(), managementServerMaintenanceManager.getLbAlgorithm(), remainingMaintenanceWindowInMs(), ignoreMaintenanceHosts); if (!agentsMigrated) { logger.warn(String.format("Unable to prepare for maintenance, cannot migrate indirect agents on this management server node %d (id: %s)", ManagementServerNode.getManagementServerId(), msHost.getUuid())); managementServerMaintenanceManager.cancelPreparingForMaintenance(msHost); @@ -617,7 +641,7 @@ protected void runInContext() { return; } - if(!agentMgr.transferDirectAgentsFromMS(msHost.getUuid(), ManagementServerNode.getManagementServerId(), remainingMaintenanceWindowInMs())) { + if(!agentMgr.transferDirectAgentsFromMS(msHost.getUuid(), ManagementServerNode.getManagementServerId(), remainingMaintenanceWindowInMs(), ignoreMaintenanceHosts)) { logger.warn(String.format("Unable to prepare for maintenance, cannot transfer direct agents on this management server node %d (id: %s)", ManagementServerNode.getManagementServerId(), msHost.getUuid())); managementServerMaintenanceManager.cancelPreparingForMaintenance(msHost); managementServerMaintenanceManager.cancelWaitForPendingJobs(); @@ -648,5 +672,14 @@ private long remainingMaintenanceWindowInMs() { long remainingMaintenanceWindowTimeInMs = (ManagementServerMaintenanceTimeoutInMins.value().longValue() * 60 * 1000) - maintenanceElapsedTimeInMs; return (remainingMaintenanceWindowTimeInMs > 0) ? remainingMaintenanceWindowTimeInMs : 0; } + + private long totalAgentsInMs() { + /* Any Host in Maintenance state could block moving Management Server to Maintenance state, exclude those Hosts from total agents count + * To exclude maintenance states use values from ResourceState as source of truth + */ + List statesToExclude = ignoreMaintenanceHosts ? ResourceState.s_maintenanceStates : List.of(); + return hostDao.countHostsByMsResourceStateTypeAndHypervisorType(ManagementServerNode.getManagementServerId(), statesToExclude, + IndirectAgentLBServiceImpl.agentValidHostTypes, null); + } } } diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/CancelMaintenanceManagementServerHostCommand.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/CancelMaintenanceManagementServerHostCommand.java index 50eb73b7bca2..0128e75783b9 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/CancelMaintenanceManagementServerHostCommand.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/CancelMaintenanceManagementServerHostCommand.java @@ -19,8 +19,18 @@ package org.apache.cloudstack.maintenance.command; public class CancelMaintenanceManagementServerHostCommand extends BaseShutdownManagementServerHostCommand { + boolean rebalance; public CancelMaintenanceManagementServerHostCommand(long msId) { super(msId); } + + public CancelMaintenanceManagementServerHostCommand(long msId, boolean rebalance) { + super(msId); + this.rebalance = rebalance; + } + + public boolean getRebalance() { + return rebalance; + } } diff --git a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java index 8f2a4e62b32d..ad96454b0542 100644 --- a/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java +++ b/plugins/maintenance/src/main/java/org/apache/cloudstack/maintenance/command/PrepareForMaintenanceManagementServerHostCommand.java @@ -20,17 +20,23 @@ public class PrepareForMaintenanceManagementServerHostCommand extends BaseShutdownManagementServerHostCommand { String lbAlgorithm; + boolean forced; public PrepareForMaintenanceManagementServerHostCommand(long msId) { super(msId); } - public PrepareForMaintenanceManagementServerHostCommand(long msId, String lbAlgorithm) { + public PrepareForMaintenanceManagementServerHostCommand(long msId, String lbAlgorithm, boolean forced) { super(msId); this.lbAlgorithm = lbAlgorithm; + this.forced = forced; } public String getLbAlgorithm() { return lbAlgorithm; } + + public boolean isForced() { + return forced; + } } diff --git a/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java b/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java index 9fe33aa6c547..0cc88fa898f1 100644 --- a/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java +++ b/plugins/maintenance/src/test/java/org/apache/cloudstack/maintenance/ManagementServerMaintenanceManagerImplTest.java @@ -305,11 +305,11 @@ public void triggerShutdownCmd() { @Test public void prepareForMaintenanceAndCancelFromMaintenanceState() { Mockito.doNothing().when(jobManagerMock).disableAsyncJobs(); - spy.prepareForMaintenance("static"); + spy.prepareForMaintenance("static", false); Mockito.verify(jobManagerMock).disableAsyncJobs(); Assert.assertThrows(CloudRuntimeException.class, () -> { - spy.prepareForMaintenance("static"); + spy.prepareForMaintenance("static", false); }); ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); @@ -324,11 +324,11 @@ public void prepareForMaintenanceAndCancelFromMaintenanceState() { @Test public void prepareForMaintenanceAndCancelFromPreparingForMaintenanceState() { Mockito.doNothing().when(jobManagerMock).disableAsyncJobs(); - spy.prepareForMaintenance("static"); + spy.prepareForMaintenance("static", false); Mockito.verify(jobManagerMock).disableAsyncJobs(); Assert.assertThrows(CloudRuntimeException.class, () -> { - spy.prepareForMaintenance("static"); + spy.prepareForMaintenance("static", false); }); ManagementServerHostVO msHost = mock(ManagementServerHostVO.class); @@ -455,7 +455,7 @@ public void prepareForMaintenanceCmdNoIndirectMsHosts() { Mockito.when(msHostDao.listNonUpStateMsIPs()).thenReturn(new ArrayList<>()); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(true); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(true); Mockito.when(indirectAgentLBMock.getManagementServerList()).thenReturn(new ArrayList<>()); Assert.assertThrows(CloudRuntimeException.class, () -> { @@ -476,7 +476,7 @@ public void prepareForMaintenanceCmdNullResponseFromClusterManager() { Mockito.when(msHostDao.findById(1L)).thenReturn(msHost1); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(false); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(false); Mockito.when(clusterManagerMock.execute(anyString(), anyLong(), anyString(), anyBoolean())).thenReturn(null); Assert.assertThrows(CloudRuntimeException.class, () -> { @@ -497,7 +497,7 @@ public void prepareForMaintenanceCmdFailedResponseFromClusterManager() { Mockito.when(msHostDao.findById(1L)).thenReturn(msHost1); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(false); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(false); Mockito.when(clusterManagerMock.execute(anyString(), anyLong(), anyString(), anyBoolean())).thenReturn("Failed"); Assert.assertThrows(CloudRuntimeException.class, () -> { @@ -518,7 +518,7 @@ public void prepareForMaintenanceCmdSuccessResponseFromClusterManager() { Mockito.when(msHostDao.findById(1L)).thenReturn(msHost1); PrepareForMaintenanceCmd cmd = mock(PrepareForMaintenanceCmd.class); Mockito.when(cmd.getManagementServerId()).thenReturn(1L); - Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong())).thenReturn(false); + Mockito.when(indirectAgentLBMock.haveAgentBasedHosts(anyLong(), anyBoolean())).thenReturn(false); Mockito.when(hostDao.listByMs(anyLong())).thenReturn(new ArrayList<>()); Mockito.when(clusterManagerMock.execute(anyString(), anyLong(), anyString(), anyBoolean())).thenReturn("Success"); diff --git a/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java b/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java index 908f3d7dad07..1891a4acfe7c 100644 --- a/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java +++ b/server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java @@ -287,6 +287,7 @@ import com.cloud.user.dao.UserDao; import com.cloud.utils.NumbersUtil; import com.cloud.utils.Pair; +import com.cloud.utils.Ternary; import com.cloud.utils.UriUtils; import com.cloud.utils.component.ManagerBase; import com.cloud.utils.crypt.DBEncryptionUtil; @@ -631,20 +632,29 @@ private void initMessageBusListener() { messageBus.subscribe(EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, new MessageSubscriber() { @Override public void onPublishMessage(String serderAddress, String subject, Object args) { - String globalSettingUpdated = (String) args; - if (StringUtils.isEmpty(globalSettingUpdated)) { + Ternary settingUpdated = (Ternary) args; + String settingNameUpdated = settingUpdated.first(); + if (StringUtils.isEmpty(settingNameUpdated)) { return; } - if (globalSettingUpdated.equals(ApiServiceConfiguration.ManagementServerAddresses.key()) || - globalSettingUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBAlgorithm.key())) { - _indirectAgentLB.propagateMSListToAgents(); - } else if (globalSettingUpdated.equals(Config.RouterAggregationCommandEachTimeout.toString()) - || globalSettingUpdated.equals(Config.MigrateWait.toString())) { + if (settingNameUpdated.equals(ApiServiceConfiguration.ManagementServerAddresses.key()) || + settingNameUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBAlgorithm.key())) { + _indirectAgentLB.propagateMSListToAgents(false); + } else if (settingNameUpdated.equals(Config.RouterAggregationCommandEachTimeout.toString()) + || settingNameUpdated.equals(Config.MigrateWait.toString())) { Map params = new HashMap(); params.put(Config.RouterAggregationCommandEachTimeout.toString(), _configDao.getValue(Config.RouterAggregationCommandEachTimeout.toString())); params.put(Config.MigrateWait.toString(), _configDao.getValue(Config.MigrateWait.toString())); _agentManager.propagateChangeToAgents(params); - } else if (VMLeaseManager.InstanceLeaseEnabled.key().equals(globalSettingUpdated)) { + } else if (settingNameUpdated.equals(IndirectAgentLBServiceImpl.IndirectAgentLBCheckInterval.key())) { + ConfigKey.Scope scope = settingUpdated.second(); + if (scope == ConfigKey.Scope.Global) { + _indirectAgentLB.propagateMSListToAgents(false); + } else if (scope == ConfigKey.Scope.Cluster) { + Long clusterId = settingUpdated.third(); + _indirectAgentLB.propagateMSListToAgentsInCluster(clusterId); + } + } else if (VMLeaseManager.InstanceLeaseEnabled.key().equals(settingNameUpdated)) { vmLeaseManager.onLeaseFeatureToggle(); } } @@ -832,7 +842,8 @@ public String updateConfiguration(final long userId, final String name, final St CallContext.current().setEventResourceId(resourceId); CallContext.current().setEventDetails(String.format(" Name: %s, New Value: %s, Scope: %s", name, value, scope.name())); - _configDepot.invalidateConfigCache(name, scope, resourceId); + _configDepot.invalidateConfigCache(name, scopeVal, resourceId); + messageBus.publish(_name, EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, PublishScope.GLOBAL, new Ternary<>(name, scopeVal, resourceId)); return valueEncrypted ? DBEncryptionUtil.decrypt(value) : value; } @@ -927,7 +938,7 @@ public String updateConfiguration(final long userId, final String name, final St } txn.commit(); - messageBus.publish(_name, EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, PublishScope.GLOBAL, name); + messageBus.publish(_name, EventTypes.EVENT_CONFIGURATION_VALUE_EDIT, PublishScope.GLOBAL, new Ternary<>(name, ConfigKey.Scope.Global, resourceId)); return _configDao.getValue(name); } diff --git a/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java b/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java index 3336d44dba81..9c8a2fd5f8f6 100644 --- a/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java +++ b/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java @@ -30,6 +30,7 @@ import javax.inject.Inject; import javax.naming.ConfigurationException; +import com.cloud.dc.ClusterVO; import org.apache.cloudstack.agent.lb.algorithm.IndirectAgentLBRoundRobinAlgorithm; import org.apache.cloudstack.agent.lb.algorithm.IndirectAgentLBShuffleAlgorithm; import org.apache.cloudstack.agent.lb.algorithm.IndirectAgentLBStaticAlgorithm; @@ -89,7 +90,9 @@ public class IndirectAgentLBServiceImpl extends ComponentLifecycleBase implement private static final List agentValidResourceStates = List.of( ResourceState.Enabled, ResourceState.Maintenance, ResourceState.Disabled, ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance); - private static final List agentValidHostTypes = List.of(Host.Type.Routing, Host.Type.ConsoleProxy, + private static final List agentNonMaintenanceResourceStates = List.of( + ResourceState.Enabled, ResourceState.Disabled); + public static final List agentValidHostTypes = List.of(Host.Type.Routing, Host.Type.ConsoleProxy, Host.Type.SecondaryStorage, Host.Type.SecondaryStorageVM); private static final List agentNonRoutingHostTypes = List.of(Host.Type.ConsoleProxy, Host.Type.SecondaryStorage, Host.Type.SecondaryStorageVM); @@ -132,7 +135,7 @@ public List getManagementServerList(final Long hostId, final Long dcId, final org.apache.cloudstack.agent.lb.IndirectAgentLBAlgorithm algorithm = getAgentMSLBAlgorithm(lbAlgorithm); List hostIdList = orderedHostIdList; if (hostIdList == null) { - hostIdList = algorithm.isHostListNeeded() ? getOrderedHostIdList(dcId) : new ArrayList<>(); + hostIdList = algorithm.isHostListNeeded() ? getOrderedHostIdList(dcId, false) : new ArrayList<>(); } // just in case we have a host in creating state make sure it is in the list: @@ -167,8 +170,8 @@ public Long getLBPreferredHostCheckInterval(final Long clusterId) { return IndirectAgentLBCheckInterval.valueIn(clusterId); } - List getOrderedHostIdList(final Long dcId) { - final List hostIdList = getAllAgentBasedHostsFromDB(dcId, null); + List getOrderedHostIdList(final Long dcId, boolean excludeHostsInMaintenance) { + final List hostIdList = getAllAgentBasedHostsFromDB(dcId, null, null, excludeHostsInMaintenance); hostIdList.sort(Comparator.comparingLong(x -> x)); return hostIdList; } @@ -259,19 +262,25 @@ private List getAllAgentBasedNonRoutingHostsFromDB(final Long zoneId, fina agentValidResourceStates, agentNonRoutingHostTypes, agentValidHypervisorTypes); } - private List getAllAgentBasedRoutingHostsFromDB(final Long zoneId, final Long clusterId, final Long msId) { + private List getAllAgentBasedRoutingHostsFromDB(final Long zoneId, final Long clusterId, final Long msId, boolean excludeHostsInMaintenance) { + List validResourceStates = excludeHostsInMaintenance ? agentNonMaintenanceResourceStates : agentValidResourceStates; return hostDao.findHostIdsByZoneClusterResourceStateTypeAndHypervisorType(zoneId, clusterId, msId, - agentValidResourceStates, List.of(Host.Type.Routing), agentValidHypervisorTypes); + validResourceStates, List.of(Host.Type.Routing), agentValidHypervisorTypes); } - private List getAllAgentBasedHostsFromDB(final Long zoneId, final Long clusterId) { + private List getAllAgentBasedHostsFromDB(final Long zoneId, final Long clusterId, final Long msId, boolean excludeHostsInMaintenance) { + List validResourceStates = excludeHostsInMaintenance ? agentNonMaintenanceResourceStates : agentValidResourceStates; return hostDao.findHostIdsByZoneClusterResourceStateTypeAndHypervisorType(zoneId, clusterId, null, - agentValidResourceStates, agentValidHostTypes, agentValidHypervisorTypes); + validResourceStates, agentValidHostTypes, agentValidHypervisorTypes); + } + + private List getAllAgentBasedHosts(long msId, boolean excludeHostsInMaintenance) { + return getAllAgentBasedHostsFromDB(null, null, msId, excludeHostsInMaintenance); } @Override - public boolean haveAgentBasedHosts(long msId) { - return CollectionUtils.isNotEmpty(getAllAgentBasedHosts(msId)); + public boolean haveAgentBasedHosts(long msId, boolean excludeHostsInMaintenance) { + return CollectionUtils.isNotEmpty(getAllAgentBasedHosts(msId, excludeHostsInMaintenance)); } private org.apache.cloudstack.agent.lb.IndirectAgentLBAlgorithm getAgentMSLBAlgorithm() { @@ -303,8 +312,8 @@ public void checkLBAlgorithmName(String lbAlgorithm) { //////////////////////////////////////////////////////////// @Override - public void propagateMSListToAgents() { - logger.debug("Propagating management server list update to agents"); + public void propagateMSListToAgents(boolean triggerHostLB) { + logger.debug("Propagating management server list update to the agents"); ExecutorService setupMSListExecutorService = Executors.newFixedThreadPool(10, new NamedThreadFactory("SetupMSList-Worker")); final String lbAlgorithm = getLBAlgorithmName(); final Long globalLbCheckInterval = getLBPreferredHostCheckInterval(null); @@ -316,20 +325,20 @@ public void propagateMSListToAgents() { Map> clusterHostIdsMap = new HashMap<>(); List clusterIds = clusterDao.listAllClusterIds(zone.getId()); for (Long clusterId : clusterIds) { - List hostIds = getAllAgentBasedRoutingHostsFromDB(zone.getId(), clusterId, null); + List hostIds = getAllAgentBasedRoutingHostsFromDB(zone.getId(), clusterId, null, false); clusterHostIdsMap.put(clusterId, hostIds); zoneHostIds.addAll(hostIds); } zoneHostIds.sort(Comparator.comparingLong(x -> x)); final List avoidMsList = mshostDao.listNonUpStateMsIPs(); for (Long nonRoutingHostId : nonRoutingHostIds) { - setupMSListExecutorService.submit(new SetupMSListTask(nonRoutingHostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, globalLbCheckInterval)); + setupMSListExecutorService.submit(new SetupMSListTask(nonRoutingHostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, globalLbCheckInterval, triggerHostLB)); } for (Long clusterId : clusterIds) { final Long clusterLbCheckInterval = getLBPreferredHostCheckInterval(clusterId); List hostIds = clusterHostIdsMap.get(clusterId); for (Long hostId : hostIds) { - setupMSListExecutorService.submit(new SetupMSListTask(hostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, clusterLbCheckInterval)); + setupMSListExecutorService.submit(new SetupMSListTask(hostId, zone.getId(), zoneHostIds, avoidMsList, lbAlgorithm, clusterLbCheckInterval, triggerHostLB)); } } } @@ -345,6 +354,45 @@ public void propagateMSListToAgents() { } } + @Override + public void propagateMSListToAgentsInCluster(Long clusterId) { + if (clusterId == null) { + return; + } + + logger.debug("Propagating management server list to the agents in cluster " + clusterId); + ClusterVO cluster = clusterDao.findById(clusterId); + if (cluster == null) { + logger.warn("Unable to propagate management server list, couldn't find cluster " + clusterId); + return; + } + DataCenterVO zone = dataCenterDao.findById(cluster.getDataCenterId()); + if (zone == null) { + logger.warn("Unable to propagate management server list, couldn't find zone of the cluster " + clusterId); + return; + } + + ExecutorService setupMSListInClusterExecutorService = Executors.newFixedThreadPool(10, new NamedThreadFactory("SetupMSListInCluster-Worker")); + final String lbAlgorithm = getLBAlgorithmName(); + List clusterHostIds = getAllAgentBasedRoutingHostsFromDB(zone.getId(), clusterId, null, false); + clusterHostIds.sort(Comparator.comparingLong(x -> x)); + final List avoidMsList = mshostDao.listNonUpStateMsIPs(); + final Long clusterLbCheckInterval = getLBPreferredHostCheckInterval(clusterId); + for (Long hostId : clusterHostIds) { + setupMSListInClusterExecutorService.submit(new SetupMSListTask(hostId, zone.getId(), clusterHostIds, avoidMsList, lbAlgorithm, clusterLbCheckInterval, false)); + } + + setupMSListInClusterExecutorService.shutdown(); + try { + if (!setupMSListInClusterExecutorService.awaitTermination(300, TimeUnit.SECONDS)) { + setupMSListInClusterExecutorService.shutdownNow(); + } + } catch (InterruptedException e) { + setupMSListInClusterExecutorService.shutdownNow(); + logger.debug(String.format("Force shutdown setup ms list in cluster service as it did not shutdown in the desired time due to: %s", e.getMessage())); + } + } + private final class SetupMSListTask extends ManagedContextRunnable { private Long hostId; private Long dcId; @@ -352,21 +400,23 @@ private final class SetupMSListTask extends ManagedContextRunnable { private List avoidMsList; private String lbAlgorithm; private Long lbCheckInterval; + private Boolean triggerHostLb; public SetupMSListTask(Long hostId, Long dcId, List orderedHostIdList, List avoidMsList, - String lbAlgorithm, Long lbCheckInterval) { + String lbAlgorithm, Long lbCheckInterval, Boolean triggerHostLb) { this.hostId = hostId; this.dcId = dcId; this.orderedHostIdList = orderedHostIdList; this.avoidMsList = avoidMsList; this.lbAlgorithm = lbAlgorithm; this.lbCheckInterval = lbCheckInterval; + this.triggerHostLb = triggerHostLb; } @Override protected void runInContext() { final List msList = getManagementServerList(hostId, dcId, orderedHostIdList); - final SetupMSListCommand cmd = new SetupMSListCommand(msList, avoidMsList, lbAlgorithm, lbCheckInterval); + final SetupMSListCommand cmd = new SetupMSListCommand(msList, avoidMsList, lbAlgorithm, lbCheckInterval, triggerHostLb); cmd.setWait(60); final Answer answer = agentManager.easySend(hostId, cmd); if (answer == null || !answer.getResult()) { @@ -419,9 +469,9 @@ protected boolean migrateNonRoutingHostAgentsInZone(String fromMsUuid, long from protected boolean migrateRoutingHostAgentsInCluster(long clusterId, String fromMsUuid, long fromMsId, DataCenter dc, long migrationStartTimeInMs, long timeoutDurationInMs, final List avoidMsList, String lbAlgorithm, - boolean lbAlgorithmChanged, List orderedHostIdList) { + boolean lbAlgorithmChanged, List orderedHostIdList, boolean excludeHostsInMaintenance) { - List agentBasedHostsOfMsInDcAndCluster = getAllAgentBasedRoutingHostsFromDB(dc.getId(), clusterId, fromMsId); + List agentBasedHostsOfMsInDcAndCluster = getAllAgentBasedRoutingHostsFromDB(dc.getId(), clusterId, fromMsId, excludeHostsInMaintenance); if (CollectionUtils.isEmpty(agentBasedHostsOfMsInDcAndCluster)) { return true; } @@ -461,7 +511,7 @@ protected boolean migrateRoutingHostAgentsInCluster(long clusterId, String fromM } @Override - public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs) { + public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { if (timeoutDurationInMs <= 0) { logger.debug(String.format("Not migrating indirect agents from management server node %d (id: %s) to other nodes, invalid timeout duration", fromMsId, fromMsUuid)); return false; @@ -469,7 +519,7 @@ public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorith logger.debug(String.format("Migrating indirect agents from management server node %d (id: %s) to other nodes", fromMsId, fromMsUuid)); long migrationStartTimeInMs = System.currentTimeMillis(); - if (!haveAgentBasedHosts(fromMsId)) { + if (!haveAgentBasedHosts(fromMsId, excludeHostsInMaintenance)) { logger.info(String.format("No indirect agents available on management server node %d (id: %s), to migrate", fromMsId, fromMsUuid)); return true; } @@ -489,7 +539,7 @@ public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorith List dataCenterList = dcDao.listAll(); for (DataCenterVO dc : dataCenterList) { if (!migrateAgentsInZone(dc, fromMsUuid, fromMsId, avoidMsList, lbAlgorithm, lbAlgorithmChanged, - migrationStartTimeInMs, timeoutDurationInMs)) { + migrationStartTimeInMs, timeoutDurationInMs, excludeHostsInMaintenance)) { return false; } } @@ -498,8 +548,8 @@ public boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorith } private boolean migrateAgentsInZone(DataCenterVO dc, String fromMsUuid, long fromMsId, List avoidMsList, - String lbAlgorithm, boolean lbAlgorithmChanged, long migrationStartTimeInMs, long timeoutDurationInMs) { - List orderedHostIdList = getOrderedHostIdList(dc.getId()); + String lbAlgorithm, boolean lbAlgorithmChanged, long migrationStartTimeInMs, long timeoutDurationInMs, boolean excludeHostsInMaintenance) { + List orderedHostIdList = getOrderedHostIdList(dc.getId(), excludeHostsInMaintenance); if (!migrateNonRoutingHostAgentsInZone(fromMsUuid, fromMsId, dc, migrationStartTimeInMs, timeoutDurationInMs, avoidMsList, lbAlgorithm, lbAlgorithmChanged, orderedHostIdList)) { return false; @@ -507,7 +557,7 @@ private boolean migrateAgentsInZone(DataCenterVO dc, String fromMsUuid, long fro List clusterIds = clusterDao.listAllClusterIds(dc.getId()); for (Long clusterId : clusterIds) { if (!migrateRoutingHostAgentsInCluster(clusterId, fromMsUuid, fromMsId, dc, migrationStartTimeInMs, - timeoutDurationInMs, avoidMsList, lbAlgorithm, lbAlgorithmChanged, orderedHostIdList)) { + timeoutDurationInMs, avoidMsList, lbAlgorithm, lbAlgorithmChanged, orderedHostIdList, excludeHostsInMaintenance)) { return false; } } @@ -547,7 +597,9 @@ protected void runInContext() { final MigrateAgentConnectionCommand cmd = new MigrateAgentConnectionCommand(msList, avoidMsList, lbAlgorithm, lbCheckInterval); cmd.setWait(60); final Answer answer = agentManager.easySend(hostId, cmd); //may not receive answer when the agent disconnects immediately and try reconnecting to other ms host - if (answer != null && !answer.getResult()) { + if (answer == null) { + logger.warn(String.format("Got empty answer while initiating migration of agent connection for host agent ID: %d", hostId)); + } else if (!answer.getResult()) { logger.warn(String.format("Error while initiating migration of agent connection for host agent ID: %d - %s", hostId, answer.getDetails())); } updateLastManagementServer(hostId, fromMsId); diff --git a/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java b/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java index 1b9923ad3ea1..9cdcce8008e9 100644 --- a/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java +++ b/server/src/test/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImplTest.java @@ -204,7 +204,7 @@ public void testExceptionOnEmptyHostSetting() throws NoSuchFieldException, Illeg public void testGetOrderedRunningHostIdsEmptyList() { doReturn(Collections.emptyList()).when(hostDao).findHostIdsByZoneClusterResourceStateTypeAndHypervisorType( Mockito.eq(DC_1_ID), Mockito.eq(null), Mockito.eq(null), Mockito.anyList(), Mockito.anyList(), Mockito.anyList()); - Assert.assertTrue(agentMSLB.getOrderedHostIdList(DC_1_ID).isEmpty()); + Assert.assertTrue(agentMSLB.getOrderedHostIdList(DC_1_ID, false).isEmpty()); } @Test @@ -213,6 +213,6 @@ public void testGetOrderedRunningHostIdsOrderList() { .findHostIdsByZoneClusterResourceStateTypeAndHypervisorType(Mockito.eq(DC_1_ID), Mockito.eq(null), Mockito.eq(null), Mockito.anyList(), Mockito.anyList(), Mockito.anyList()); Assert.assertEquals(Arrays.asList(host1.getId(), host2.getId(), host3.getId(), host4.getId()), - agentMSLB.getOrderedHostIdList(DC_1_ID)); + agentMSLB.getOrderedHostIdList(DC_1_ID, false)); } } diff --git a/ui/public/locales/en.json b/ui/public/locales/en.json index b3d5536e1f15..753fb84ba028 100644 --- a/ui/public/locales/en.json +++ b/ui/public/locales/en.json @@ -1876,6 +1876,7 @@ "label.read.io": "Read (IO)", "label.readonly": "Read-Only", "label.reason": "Reason", +"label.rebalance": "Rebalance", "label.reboot": "Reboot", "label.recent.deliveries": "Recent deliveries", "label.receivedbytes": "Bytes received", diff --git a/ui/src/config/section/infra/managementServers.js b/ui/src/config/section/infra/managementServers.js index bd17a4b8d5aa..28e99069cf52 100644 --- a/ui/src/config/section/infra/managementServers.js +++ b/ui/src/config/section/infra/managementServers.js @@ -75,6 +75,7 @@ export default { message: 'message.cancel.maintenance', dataView: true, popup: true, + args: ['rebalance'], show: (record, store) => { return ['PreparingForMaintenance', 'Maintenance'].includes(record.state) }, mapping: { managementserverid: { diff --git a/ui/src/views/infra/Confirmation.vue b/ui/src/views/infra/Confirmation.vue index ea166ac32191..ea7b841522db 100644 --- a/ui/src/views/infra/Confirmation.vue +++ b/ui/src/views/infra/Confirmation.vue @@ -45,6 +45,12 @@ + + + +