Skip to content

Commit be437b0

Browse files
sureshanaparti and dhslove
authored and committed
Management Server - Prepare for Maintenance and Cancel Maintenance improvements (apache#10995)
* Management Server - Prepare for Maintenance and Cancel Maintenance improvements:
  - Added new setting 'management.server.maintenance.ignore.maintenance.hosts' to ignore hosts in maintenance states while preparing management server for maintenance. This skips agent transfer and agents count check for hosts in maintenance.
  - Rebalance indirect agents after cancel maintenance, using rebalance parameter in cancelMaintenance API.
  - Force maintenance after maintenance window timeout, using forced parameter in prepareForMaintenance API.
  - Propagate 'indirect.agent.lb.check.interval' setting change to the host agents.
* rebase fixes
* code improvements, cleanup
* [UI] Set rebalance true by default in cancel maintenance dialog
* Update MS state after executing cluster cmd in the target MS, and some code improvements
* code improvements
* Ensure the host lb algorithm 'shuffle' is applied once before disabling the indirect agent lb check background task
1 parent d2bac60 commit be437b0

File tree

26 files changed

+412
-148
lines changed

26 files changed

+412
-148
lines changed

agent/src/main/java/com/cloud/agent/Agent.java

Lines changed: 21 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -453,22 +453,30 @@ private void scheduleCertificateRenewalTask() {
453453
certExecutor.schedule(new PostCertificateRenewalTask(this), 5, TimeUnit.SECONDS);
454454
}
455455

456-
private void scheduleHostLBCheckerTask(final long checkInterval) {
456+
private void scheduleHostLBCheckerTask(final String lbAlgorithm, final long checkInterval) {
457457
String name = "HostLBCheckerTask";
458458
if (hostLbCheckExecutor != null && !hostLbCheckExecutor.isShutdown()) {
459+
logger.info("Shutting down the preferred host checker task {}", name);
459460
hostLbCheckExecutor.shutdown();
460461
try {
461462
if (!hostLbCheckExecutor.awaitTermination(1, TimeUnit.SECONDS)) {
462463
hostLbCheckExecutor.shutdownNow();
463464
}
464465
} catch (InterruptedException e) {
465-
logger.debug("Forcing {} shutdown as it did not shutdown in the desired time due to: {}",
466+
logger.debug("Forcing the preferred host checker task {} shutdown as it did not shutdown in the desired time due to: {}",
466467
name, e.getMessage());
467468
hostLbCheckExecutor.shutdownNow();
468469
}
469470
}
470471
if (checkInterval > 0L) {
471-
logger.info("Scheduling preferred host task with host.lb.interval={}ms", checkInterval);
472+
if ("shuffle".equalsIgnoreCase(lbAlgorithm)) {
473+
logger.info("Scheduling the preferred host checker task to trigger once (to apply lb algorithm '{}') after host.lb.interval={} ms", lbAlgorithm, checkInterval);
474+
hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name)));
475+
hostLbCheckExecutor.schedule(new PreferredHostCheckerTask(), checkInterval, TimeUnit.MILLISECONDS);
476+
return;
477+
}
478+
479+
logger.info("Scheduling a recurring preferred host checker task with lb algorithm '{}' and host.lb.interval={} ms", lbAlgorithm, checkInterval);
472480
hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name)));
473481
hostLbCheckExecutor.scheduleAtFixedRate(new PreferredHostCheckerTask(), checkInterval, checkInterval,
474482
TimeUnit.MILLISECONDS);
@@ -925,7 +933,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
925933
return new SetupCertificateAnswer(true);
926934
}
927935

928-
private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
936+
private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final boolean triggerHostLB) {
929937
if (CollectionUtils.isNotEmpty(msList) && StringUtils.isNotEmpty(lbAlgorithm)) {
930938
try {
931939
final String newMSHosts = String.format("%s%s%s", com.cloud.utils.StringUtils.toCSVList(msList), IAgentShell.hostLbAlgorithmSeparator, lbAlgorithm);
@@ -938,22 +946,24 @@ private void processManagementServerList(final List<String> msList, final List<S
938946
}
939947
}
940948
shell.setAvoidHosts(avoidMsList);
941-
if ("shuffle".equals(lbAlgorithm)) {
942-
scheduleHostLBCheckerTask(0);
943-
} else {
944-
scheduleHostLBCheckerTask(shell.getLbCheckerInterval(lbCheckInterval));
949+
if (triggerHostLB) {
950+
logger.info("Triggering the preferred host checker task now");
951+
ScheduledExecutorService hostLbExecutor = Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("HostLB-Executor"));
952+
hostLbExecutor.schedule(new PreferredHostCheckerTask(), 0, TimeUnit.MILLISECONDS);
953+
hostLbExecutor.shutdown();
945954
}
955+
scheduleHostLBCheckerTask(lbAlgorithm, shell.getLbCheckerInterval(lbCheckInterval));
946956
}
947957

948958
private Answer setupManagementServerList(final SetupMSListCommand cmd) {
949-
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
959+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), cmd.getTriggerHostLb());
950960
return new SetupMSListAnswer(true);
951961
}
952962

953963
private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
954964
try {
955965
if (CollectionUtils.isNotEmpty(cmd.getMsList())) {
956-
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
966+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), false);
957967
}
958968
Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("MigrateAgentConnection-Job")).schedule(() -> {
959969
migrateAgentConnection(cmd.getAvoidMsList());
@@ -1043,7 +1053,7 @@ public void processReadyCommand(final Command cmd) {
10431053
}
10441054

10451055
verifyAgentArch(ready.getArch());
1046-
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval());
1056+
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval(), false);
10471057

10481058
logger.info("Ready command is processed for agent [id: {}, uuid: {}, name: {}]", getId(), getUuid(), getName());
10491059
}

api/src/main/java/com/cloud/exception/OperationTimedoutException.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ public class OperationTimedoutException extends CloudException {
4040
boolean _isActive;
4141

4242
public OperationTimedoutException(Command[] cmds, long agentId, long seqId, int time, boolean isActive) {
43-
super("Commands " + seqId + " to Host " + agentId + " timed out after " + time);
43+
super("Commands " + seqId + " to Host " + agentId + " timed out after " + time + " secs");
4444
_agentId = agentId;
4545
_seqId = seqId;
4646
_time = time;

api/src/main/java/com/cloud/resource/ResourceState.java

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,10 @@ public static Event toEvent(String e) {
7676
}
7777
}
7878

79+
public static List<ResourceState> s_maintenanceStates = List.of(ResourceState.Maintenance,
80+
ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance,
81+
ResourceState.ErrorInPrepareForMaintenance);
82+
7983
public ResourceState getNextState(Event a) {
8084
return s_fsm.getNextState(this, a);
8185
}
@@ -98,8 +102,7 @@ public static String[] toString(ResourceState... states) {
98102
}
99103

100104
public static boolean isMaintenanceState(ResourceState state) {
101-
return Arrays.asList(ResourceState.Maintenance, ResourceState.ErrorInMaintenance,
102-
ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance).contains(state);
105+
return s_maintenanceStates.contains(state);
103106
}
104107

105108
public static boolean canAttemptMaintenance(ResourceState state) {

api/src/main/java/org/apache/cloudstack/api/ApiConstants.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -448,6 +448,7 @@ public class ApiConstants {
448448
public static final String PUBLIC_END_PORT = "publicendport";
449449
public static final String PUBLIC_ZONE = "publiczone";
450450
public static final String PURGE_RESOURCES = "purgeresources";
451+
public static final String REBALANCE = "rebalance";
451452
public static final String RECEIVED_BYTES = "receivedbytes";
452453
public static final String RECONNECT = "reconnect";
453454
public static final String RECOVER = "recover";

api/src/main/java/org/apache/cloudstack/api/command/admin/systemvm/PatchSystemVMCmd.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ public class PatchSystemVMCmd extends BaseAsyncCmd {
4646
@Parameter(name = ApiConstants.FORCED, type = CommandType.BOOLEAN,
4747
description = "If true, initiates copy of scripts and restart of the agent, even if the scripts version matches." +
4848
"To be used with ID parameter only")
49-
private Boolean force;
49+
private Boolean forced;
5050

5151
/////////////////////////////////////////////////////
5252
/////////////////// Accessors ///////////////////////
@@ -58,7 +58,7 @@ public Long getId() {
5858
}
5959

6060
public boolean isForced() {
61-
return force != null && force;
61+
return forced != null && forced;
6262
}
6363

6464
/////////////////////////////////////////////////////

core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,15 @@ public class SetupMSListCommand extends Command {
2929
private List<String> avoidMsList;
3030
private String lbAlgorithm;
3131
private Long lbCheckInterval;
32+
private Boolean triggerHostLb;
3233

33-
public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
34+
public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final Boolean triggerHostLb) {
3435
super();
3536
this.msList = msList;
3637
this.avoidMsList = avoidMsList;
3738
this.lbAlgorithm = lbAlgorithm;
3839
this.lbCheckInterval = lbCheckInterval;
40+
this.triggerHostLb = triggerHostLb;
3941
}
4042

4143
public List<String> getMsList() {
@@ -54,9 +56,12 @@ public Long getLbCheckInterval() {
5456
return lbCheckInterval;
5557
}
5658

59+
public boolean getTriggerHostLb() {
60+
return triggerHostLb;
61+
}
62+
5763
@Override
5864
public boolean executeInSequence() {
5965
return false;
6066
}
61-
6267
}

engine/components-api/src/main/java/com/cloud/agent/AgentManager.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -171,5 +171,5 @@ enum TapAgentsAction {
171171

172172
void propagateChangeToAgents(Map<String, String> params);
173173

174-
boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs);
174+
boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance);
175175
}

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -274,8 +274,6 @@ public boolean configure(final String name, final Map<String, Object> params) th
274274

275275
_executor = new ThreadPoolExecutor(agentTaskThreads, agentTaskThreads, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), new NamedThreadFactory("AgentTaskPool")); _executor.allowCoreThreadTimeOut(true);
276276

277-
initConnectExecutor();
278-
279277
maxConcurrentNewAgentConnections = RemoteAgentMaxConcurrentNewConnections.value();
280278

281279
_connection = new NioServer("AgentManager", Port.value(), Workers.value() + 10,
@@ -856,6 +854,7 @@ public boolean start() {
856854
return true;
857855
}
858856

857+
initConnectExecutor();
859858
startDirectlyConnectedHosts(false);
860859

861860
if (_connection != null) {
@@ -2223,7 +2222,7 @@ public void propagateChangeToAgents(Map<String, String> params) {
22232222
}
22242223

22252224
@Override
2226-
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) {
2225+
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) {
22272226
return true;
22282227
}
22292228

engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java

Lines changed: 19 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242
import javax.net.ssl.SSLContext;
4343
import javax.net.ssl.SSLEngine;
4444

45+
import com.cloud.resource.ResourceState;
4546
import org.apache.cloudstack.ca.CAManager;
4647
import org.apache.cloudstack.framework.config.ConfigDepot;
4748
import org.apache.cloudstack.framework.config.ConfigKey;
@@ -431,10 +432,10 @@ public boolean routeToPeer(final String peer, final byte[] bytes) {
431432
ch = connectToPeer(peer, ch);
432433
if (ch == null) {
433434
try {
434-
logD(bytes, "Unable to route to peer: " + Request.parse(bytes).toString());
435+
logD(bytes, "Unable to establish connection to route to peer: " + Request.parse(bytes));
435436
} catch (ClassNotFoundException | UnsupportedVersionException e) {
436437
// Request.parse thrown exception when we try to log it, log as much as we can
437-
logD(bytes, "Unable to route to peer, and Request.parse further caught exception" + e.getMessage());
438+
logD(bytes, "Unable to establish connection to route to peer, and Request.parse further caught exception" + e.getMessage());
438439
}
439440
return false;
440441
}
@@ -644,7 +645,6 @@ protected void doTask(final Task task) throws TaskExecutionException {
644645
final Link link = task.getLink();
645646

646647
if (Request.fromServer(data)) {
647-
648648
final AgentAttache agent = findAttache(hostId);
649649

650650
if (Request.isControl(data)) {
@@ -693,7 +693,6 @@ protected void doTask(final Task task) throws TaskExecutionException {
693693
cancel(Long.toString(Request.getManagementServerId(data)), hostId, Request.getSequence(data), e.getMessage());
694694
}
695695
} else {
696-
697696
final long mgmtId = Request.getManagementServerId(data);
698697
if (mgmtId != -1 && mgmtId != _nodeId) {
699698
routeToPeer(Long.toString(mgmtId), data);
@@ -1362,7 +1361,7 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS
13621361
if (cmd instanceof PrepareForMaintenanceManagementServerHostCommand) {
13631362
logger.debug("Received PrepareForMaintenanceManagementServerHostCommand - preparing for maintenance");
13641363
try {
1365-
managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm());
1364+
managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm(), ((PrepareForMaintenanceManagementServerHostCommand) cmd).isForced());
13661365
return "Successfully prepared for maintenance";
13671366
} catch(CloudRuntimeException e) {
13681367
return e.getMessage();
@@ -1409,15 +1408,15 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS
14091408
}
14101409

14111410
@Override
1412-
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) {
1411+
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) {
14131412
if (timeoutDurationInMs <= 0) {
14141413
logger.debug(String.format("Not transferring direct agents from management server node %d (id: %s) to other nodes, invalid timeout duration", fromMsId, fromMsUuid));
14151414
return false;
14161415
}
14171416

14181417
long transferStartTimeInMs = System.currentTimeMillis();
1419-
if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId))) {
1420-
logger.info(String.format("No direct agent hosts available on management server node %d (id: %s), to transfer", fromMsId, fromMsUuid));
1418+
if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId, excludeHostsInMaintenance))) {
1419+
logger.info("No direct agent hosts available on management server node {} (id: {}), to transfer", fromMsId, fromMsUuid);
14211420
return true;
14221421
}
14231422

@@ -1431,7 +1430,7 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long
14311430
int agentTransferFailedCount = 0;
14321431
List<DataCenterVO> dataCenterList = dcDao.listAll();
14331432
for (DataCenterVO dc : dataCenterList) {
1434-
List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId());
1433+
List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId(), excludeHostsInMaintenance);
14351434
if (CollectionUtils.isEmpty(directAgentHostsInDc)) {
14361435
continue;
14371436
}
@@ -1465,9 +1464,10 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long
14651464
return (agentTransferFailedCount == 0);
14661465
}
14671466

1468-
private List<HostVO> getDirectAgentHosts(long msId) {
1467+
private List<HostVO> getDirectAgentHosts(long msId, boolean excludeHostsInMaintenance) {
14691468
List<HostVO> directAgentHosts = new ArrayList<>();
1470-
List<HostVO> hosts = _hostDao.listHostsByMs(msId);
1469+
List<ResourceState> statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of();
1470+
List<HostVO> hosts = _hostDao.listHostsByMsResourceState(msId, statesToExclude);
14711471
for (HostVO host : hosts) {
14721472
AgentAttache agent = findAttache(host.getId());
14731473
if (agent != null && agent instanceof DirectAgentAttache) {
@@ -1478,9 +1478,11 @@ private List<HostVO> getDirectAgentHosts(long msId) {
14781478
return directAgentHosts;
14791479
}
14801480

1481-
private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId) {
1481+
private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId, boolean excludeHostsInMaintenance) {
14821482
List<HostVO> directAgentHosts = new ArrayList<>();
1483-
List<HostVO> hosts = _hostDao.listHostsByMsAndDc(msId, dcId);
1483+
// To exclude maintenance states use values from ResourceState as source of truth
1484+
List<ResourceState> statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of();
1485+
List<HostVO> hosts = _hostDao.listHostsByMsDcResourceState(msId, dcId, statesToExclude);
14841486
for (HostVO host : hosts) {
14851487
AgentAttache agent = findAttache(host.getId());
14861488
if (agent != null && agent instanceof DirectAgentAttache) {
@@ -1522,6 +1524,10 @@ public void onManagementServerPreparingForMaintenance() {
15221524
public void onManagementServerCancelPreparingForMaintenance() {
15231525
logger.debug("Management server cancel preparing for maintenance");
15241526
super.onManagementServerPreparingForMaintenance();
1527+
1528+
// needed for the case when Management Server in Preparing For Maintenance but didn't go to Maintenance state
1529+
// (where this variable will be reset)
1530+
_agentLbHappened = false;
15251531
}
15261532

15271533
@Override

engine/schema/src/main/java/com/cloud/host/dao/HostDao.java

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -177,14 +177,24 @@ public interface HostDao extends GenericDao<HostVO, Long>, StateDao<Status, Stat
177177

178178
List<HostVO> listHostsByMsAndDc(long msId, long dcId);
179179

180+
List<HostVO> listHostsByMsDcResourceState(long msId, long dcId, List<ResourceState> excludedResourceStates);
181+
180182
List<HostVO> listHostsByMs(long msId);
181183

184+
List<HostVO> listHostsByMsResourceState(long msId, List<ResourceState> excludedResourceStates);
185+
182186
/**
183-
* Retrieves the number of hosts/agents this {@see ManagementServer} has responsibility over.
184-
* @param msId the id of the {@see ManagementServer}
185-
* @return the number of hosts/agents this {@see ManagementServer} has responsibility over
187+
* Count Hosts by given Management Server, Host and Hypervisor Types,
188+
* and exclude Hosts with given Resource States.
189+
*
190+
* @param msId Management Server Id
191+
* @param excludedResourceStates Resource States to be excluded
192+
* @param hostTypes Host Types
193+
* @param hypervisorTypes Hypervisor Types
194+
* @return Hosts count
186195
*/
187-
int countByMs(long msId);
196+
int countHostsByMsResourceStateTypeAndHypervisorType(long msId, List<ResourceState> excludedResourceStates,
197+
List<Type> hostTypes, List<HypervisorType> hypervisorTypes);
188198

189199
/**
190200
* Retrieves the host ids/agents this {@see ManagementServer} has responsibility over.

0 commit comments

Comments
 (0)