Skip to content

Commit 0817b69

Browse files
sureshanapartidhslove
authored and committed
MS maintenance improvements (apache#10417)
* Update last agents during ms maintenance, and some code improvements
* Send 503 (Service Unavailable) response status when maintenance or shutdown is initiated [Any load balancer in the clustered environment can avoid routing requests to this MS node]
* Migrate systemvm agents before routing host agents, and some code improvements
* Added events for ms maintenance and shutdown operations
* Added the following ms maintenance and shutdown improvements
  - block new agent connections during prepare for maintenance of ms
  - maintain avoids ms list
  - propagate updated management servers list and lb algorithm in host and indirect.agent.lb.algorithm settings respectively, to systemvm (non-routing) agents
  - updated setup ms list and migrate agent connections to executor service
  - migrate agent connection through executor, and send the answer to the ms host that initiated the migration
  - re-initialize ssl handshake executor if it is shutdown
  - don't allow prepare for maintenance or shutdown when other management server nodes are in preparing states
  - don't allow trigger shutdown when management server is up and other management server nodes are in preparing states
  - stop agent connections monitor on ms maintenance
  - update avoid ms list in ready command
  - updated connected host from the client connection
  - update last agents in ms metrics from the database
  - updated some agent config descriptions
  - update last management server in the hosts during shutdown
  - added agents and lastagents in management server response
  - updated management server maintenance & shutdown unit tests
  - some code improvements
* refactored code / addressed comments
* removed shutdown testcase (maybe, calling System.exit)
* Revert "removed shutdown testcase (maybe, calling System.exit)"
  This reverts commit e14b071.
* avoid system.exit during shutdown test
* code improvements
* testcase fix
* Fix cutoff time in agent connections monitor thread
1 parent 8a57308 commit 0817b69

File tree

35 files changed

+1307
-216
lines changed

35 files changed

+1307
-216
lines changed

agent/src/main/java/com/cloud/agent/Agent.java

Lines changed: 53 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ public void start() {
342342
logger.info("Attempted to connect to the server, but received an unexpected exception, trying again...", e);
343343
}
344344
}
345-
shell.updateConnectedHost();
345+
shell.updateConnectedHost(((NioClient)connection).getHost());
346346
scavengeOldAgentObjects();
347347
}
348348

@@ -617,15 +617,11 @@ public Task create(final Task.Type type, final Link link, final byte[] data) {
617617
}
618618

619619
protected void reconnect(final Link link) {
620-
reconnect(link, null, null, false);
620+
reconnect(link, null, false);
621621
}
622622

623-
protected void reconnect(final Link link, String preferredHost, List<String> avoidHostList, boolean forTransfer) {
623+
protected void reconnect(final Link link, String preferredMSHost, boolean forTransfer) {
624624
if (!(forTransfer || reconnectAllowed)) {
625-
return;
626-
}
627-
628-
if (!reconnectAllowed) {
629625
logger.debug("Reconnect requested but it is not allowed {}", () -> getLinkLog(link));
630626
return;
631627
}
@@ -637,19 +633,26 @@ protected void reconnect(final Link link, String preferredHost, List<String> avo
637633
serverResource.disconnected();
638634
logger.info("Lost connection to host: {}. Attempting reconnection while we still have {} commands in progress.", shell.getConnectedHost(), commandsInProgress.get());
639635
stopAndCleanupConnection(true);
636+
String host = preferredMSHost;
637+
if (org.apache.commons.lang3.StringUtils.isBlank(host)) {
638+
host = shell.getNextHost();
639+
}
640+
List<String> avoidMSHostList = shell.getAvoidHosts();
640641
do {
641-
final String host = shell.getNextHost();
642-
connection = new NioClient(getAgentName(), host, shell.getPort(), shell.getWorkers(), shell.getSslHandshakeTimeout(), this);
643-
logger.info("Reconnecting to host: {}", host);
644-
try {
645-
connection.start();
646-
} catch (final NioConnectionException e) {
647-
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
648-
stopAndCleanupConnection(false);
642+
if (CollectionUtils.isEmpty(avoidMSHostList) || !avoidMSHostList.contains(host)) {
643+
connection = new NioClient(getAgentName(), host, shell.getPort(), shell.getWorkers(), shell.getSslHandshakeTimeout(), this);
644+
logger.info("Reconnecting to host: {}", host);
645+
try {
646+
connection.start();
647+
} catch (final NioConnectionException e) {
648+
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
649+
stopAndCleanupConnection(false);
650+
}
649651
}
650652
shell.getBackoffAlgorithm().waitBeforeRetry();
653+
host = shell.getNextHost();
651654
} while (!connection.isStartup());
652-
shell.updateConnectedHost();
655+
shell.updateConnectedHost(((NioClient)connection).getHost());
653656
logger.info("Connected to the host: {}", shell.getConnectedHost());
654657
}
655658

@@ -919,7 +922,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
919922
return new SetupCertificateAnswer(true);
920923
}
921924

922-
private void processManagementServerList(final List<String> msList, final String lbAlgorithm, final Long lbCheckInterval) {
925+
private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
923926
if (CollectionUtils.isNotEmpty(msList) && StringUtils.isNotEmpty(lbAlgorithm)) {
924927
try {
925928
final String newMSHosts = String.format("%s%s%s", com.cloud.utils.StringUtils.toCSVList(msList), IAgentShell.hostLbAlgorithmSeparator, lbAlgorithm);
@@ -931,6 +934,7 @@ private void processManagementServerList(final List<String> msList, final String
931934
throw new CloudRuntimeException("Could not persist received management servers list", e);
932935
}
933936
}
937+
shell.setAvoidHosts(avoidMsList);
934938
if ("shuffle".equals(lbAlgorithm)) {
935939
scheduleHostLBCheckerTask(0);
936940
} else {
@@ -939,16 +943,18 @@ private void processManagementServerList(final List<String> msList, final String
939943
}
940944

941945
private Answer setupManagementServerList(final SetupMSListCommand cmd) {
942-
processManagementServerList(cmd.getMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
946+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
943947
return new SetupMSListAnswer(true);
944948
}
945949

946950
private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
947951
try {
948952
if (CollectionUtils.isNotEmpty(cmd.getMsList())) {
949-
processManagementServerList(cmd.getMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
953+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
950954
}
951-
migrateAgentConnection(cmd.getAvoidMsList());
955+
Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("MigrateAgentConnection-Job")).schedule(() -> {
956+
migrateAgentConnection(cmd.getAvoidMsList());
957+
}, 3, TimeUnit.SECONDS);
952958
} catch (Exception e) {
953959
String errMsg = "Migrate agent connection failed, due to " + e.getMessage();
954960
logger.debug(errMsg, e);
@@ -969,25 +975,26 @@ private void migrateAgentConnection(List<String> avoidMsList) {
969975
throw new CloudRuntimeException("No other Management Server hosts to migrate");
970976
}
971977

972-
String preferredHost = null;
978+
String preferredMSHost = null;
973979
for (String msHost : msHostsList) {
974980
try (final Socket socket = new Socket()) {
975-
socket.connect(new InetSocketAddress(msHost, _shell.getPort()), 5000);
976-
preferredHost = msHost;
981+
socket.connect(new InetSocketAddress(msHost, shell.getPort()), 5000);
982+
preferredMSHost = msHost;
977983
break;
978984
} catch (final IOException e) {
979985
throw new CloudRuntimeException("Management server host: " + msHost + " is not reachable, to migrate connection");
980986
}
981987
}
982988

983-
if (preferredHost == null) {
989+
if (preferredMSHost == null) {
984990
throw new CloudRuntimeException("Management server host(s) are not reachable, to migrate connection");
985991
}
986992

987-
logger.debug("Management server host " + preferredHost + " is found to be reachable, trying to reconnect");
988-
_shell.resetHostCounter();
989-
_shell.setConnectionTransfer(true);
990-
reconnect(_link, preferredHost, avoidMsList, true);
993+
logger.debug("Management server host " + preferredMSHost + " is found to be reachable, trying to reconnect");
994+
shell.resetHostCounter();
995+
shell.setAvoidHosts(avoidMsList);
996+
shell.setConnectionTransfer(true);
997+
reconnect(link, preferredMSHost, true);
991998
}
992999

9931000
public void processResponse(final Response response, final Link link) {
@@ -1000,14 +1007,21 @@ public void processResponse(final Response response, final Link link) {
10001007
for (final IAgentControlListener listener : controlListeners) {
10011008
listener.processControlResponse(response, (AgentControlAnswer)answer);
10021009
}
1003-
} else if (answer instanceof PingAnswer && (((PingAnswer) answer).isSendStartup()) && reconnectAllowed) {
1004-
logger.info("Management server requested startup command to reinitialize the agent");
1005-
sendStartup(link);
1010+
} else if (answer instanceof PingAnswer) {
1011+
processPingAnswer((PingAnswer) answer);
10061012
} else {
10071013
updateLastPingResponseTime();
10081014
}
10091015
}
10101016

1017+
private void processPingAnswer(final PingAnswer answer) {
1018+
if ((answer.isSendStartup()) && reconnectAllowed) {
1019+
logger.info("Management server requested startup command to reinitialize the agent");
1020+
sendStartup(link);
1021+
}
1022+
shell.setAvoidHosts(answer.getAvoidMsList());
1023+
}
1024+
10111025
public void processReadyCommand(final Command cmd) {
10121026
final ReadyCommand ready = (ReadyCommand)cmd;
10131027
// Set human readable sizes;
@@ -1024,7 +1038,7 @@ public void processReadyCommand(final Command cmd) {
10241038
}
10251039

10261040
verifyAgentArch(ready.getArch());
1027-
processManagementServerList(ready.getMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval());
1041+
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval());
10281042

10291043
logger.info("Ready command is processed for agent [id: {}, uuid: {}, name: {}]", getId(), getUuid(), getName());
10301044
}
@@ -1371,26 +1385,26 @@ protected void runInContext() {
13711385
if (msList == null || msList.length < 1) {
13721386
return;
13731387
}
1374-
final String preferredHost = msList[0];
1388+
final String preferredMSHost = msList[0];
13751389
final String connectedHost = shell.getConnectedHost();
13761390
logger.debug("Running preferred host checker task, connected host={}, preferred host={}",
1377-
connectedHost, preferredHost);
1378-
if (preferredHost == null || preferredHost.equals(connectedHost) || link == null) {
1391+
connectedHost, preferredMSHost);
1392+
if (preferredMSHost == null || preferredMSHost.equals(connectedHost) || link == null) {
13791393
return;
13801394
}
13811395
boolean isHostUp = false;
13821396
try (final Socket socket = new Socket()) {
1383-
socket.connect(new InetSocketAddress(preferredHost, shell.getPort()), 5000);
1397+
socket.connect(new InetSocketAddress(preferredMSHost, shell.getPort()), 5000);
13841398
isHostUp = true;
13851399
} catch (final IOException e) {
1386-
logger.debug("Host: {} is not reachable", preferredHost);
1400+
logger.debug("Host: {} is not reachable", preferredMSHost);
13871401
}
13881402
if (isHostUp && link != null && commandsInProgress.get() == 0) {
13891403
if (logger.isDebugEnabled()) {
1390-
logger.debug("Preferred host {} is found to be reachable, trying to reconnect", preferredHost);
1404+
logger.debug("Preferred host {} is found to be reachable, trying to reconnect", preferredMSHost);
13911405
}
13921406
shell.resetHostCounter();
1393-
reconnect(link);
1407+
reconnect(link, preferredMSHost, false);
13941408
}
13951409
} catch (Throwable t) {
13961410
logger.error("Error caught while attempting to connect to preferred host", t);

agent/src/main/java/com/cloud/agent/AgentShell.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public class AgentShell implements IAgentShell, Daemon {
6666
private String _zone;
6767
private String _pod;
6868
private String _host;
69+
private List<String> _avoidHosts;
6970
private String _privateIp;
7071
private int _port;
7172
private int _proxyPort;
@@ -76,7 +77,6 @@ public class AgentShell implements IAgentShell, Daemon {
7677
private volatile boolean _exit = false;
7778
private int _pingRetries;
7879
private final List<Agent> _agents = new ArrayList<Agent>();
79-
private String hostToConnect;
8080
private String connectedHost;
8181
private Long preferredHostCheckInterval;
8282
private boolean connectionTransfer = false;
@@ -121,7 +121,7 @@ public String getNextHost() {
121121
if (_hostCounter >= hosts.length) {
122122
_hostCounter = 0;
123123
}
124-
hostToConnect = hosts[_hostCounter % hosts.length];
124+
String hostToConnect = hosts[_hostCounter % hosts.length];
125125
_hostCounter++;
126126
return hostToConnect;
127127
}
@@ -143,11 +143,10 @@ public long getLbCheckerInterval(final Long receivedLbInterval) {
143143
}
144144

145145
@Override
146-
public void updateConnectedHost() {
147-
connectedHost = hostToConnect;
146+
public void updateConnectedHost(String connectedHost) {
147+
this.connectedHost = connectedHost;
148148
}
149149

150-
151150
@Override
152151
public void resetHostCounter() {
153152
_hostCounter = 0;
@@ -166,6 +165,16 @@ public void setHosts(final String host) {
166165
}
167166
}
168167

168+
@Override
169+
public void setAvoidHosts(List<String> avoidHosts) {
170+
_avoidHosts = avoidHosts;
171+
}
172+
173+
@Override
174+
public List<String> getAvoidHosts() {
175+
return _avoidHosts;
176+
}
177+
169178
@Override
170179
public String getPrivateIp() {
171180
return _privateIp;

agent/src/main/java/com/cloud/agent/IAgentShell.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// under the License.
1717
package com.cloud.agent;
1818

19+
import java.util.List;
1920
import java.util.Map;
2021
import java.util.Properties;
2122

@@ -63,9 +64,13 @@ public interface IAgentShell {
6364

6465
String[] getHosts();
6566

67+
void setAvoidHosts(List<String> hosts);
68+
69+
List<String> getAvoidHosts();
70+
6671
long getLbCheckerInterval(Long receivedLbInterval);
6772

68-
void updateConnectedHost();
73+
void updateConnectedHost(String connectedHost);
6974

7075
String getConnectedHost();
7176

agent/src/main/java/com/cloud/agent/properties/AgentProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,7 @@ public Property<Integer> getWorkers() {
828828
* Data type: Integer.<br>
829829
* Default value: <code>null</code>
830830
*/
831-
public static final Property<Integer> SSL_HANDSHAKE_TIMEOUT = new Property<>("ssl.handshake.timeout", null, Integer.class);
831+
public static final Property<Integer> SSL_HANDSHAKE_TIMEOUT = new Property<>("ssl.handshake.timeout", 30, Integer.class);
832832

833833
public static class Property <T>{
834834
private String name;

agent/src/test/java/com/cloud/agent/AgentShellTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ public void updateAndGetConnectedHost() {
358358
AgentShell shell = new AgentShell();
359359
shell.setHosts("test");
360360
shell.getNextHost();
361-
shell.updateConnectedHost();
361+
shell.updateConnectedHost("test");
362362

363363
Assert.assertEquals(expected, shell.getConnectedHost());
364364
}

api/src/main/java/com/cloud/event/EventTypes.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,13 @@ public class EventTypes {
748748
public static final String EVENT_SECURITY_CHECK = "SECURITY.CHECK";
749749
public static final String EVENT_INTEGRITY_VERIFICATION = "INTEGRITY.VERIFICATION";
750750
public static final String EVENT_LOG_AUTO_DELETED = "LOG.DELETED";
751+
752+
// Management Server
753+
public static final String EVENT_MS_MAINTENANCE_PREPARE = "MS.MAINTENANCE.PREPARE";
754+
public static final String EVENT_MS_MAINTENANCE_CANCEL = "MS.MAINTENANCE.CANCEL";
755+
public static final String EVENT_MS_SHUTDOWN_PREPARE = "MS.SHUTDOWN.PREPARE";
756+
public static final String EVENT_MS_SHUTDOWN_CANCEL = "MS.SHUTDOWN.CANCEL";
757+
public static final String EVENT_MS_SHUTDOWN = "MS.SHUTDOWN";
751758

752759
// OBJECT STORE
753760
public static final String EVENT_OBJECT_STORE_CREATE = "OBJECT.STORE.CREATE";
@@ -1249,6 +1256,12 @@ public class EventTypes {
12491256

12501257
//Security
12511258
entityEventDetails.put(EVENT_SECURITY_CHECK, "Security");
1259+
1260+
entityEventDetails.put(EVENT_MS_MAINTENANCE_PREPARE, "ManagementServer");
1261+
entityEventDetails.put(EVENT_MS_MAINTENANCE_CANCEL, "ManagementServer");
1262+
entityEventDetails.put(EVENT_MS_SHUTDOWN_PREPARE, "ManagementServer");
1263+
entityEventDetails.put(EVENT_MS_SHUTDOWN_CANCEL, "ManagementServer");
1264+
entityEventDetails.put(EVENT_MS_SHUTDOWN, "ManagementServer");
12521265

12531266
//Object Store
12541267
entityEventDetails.put(EVENT_OBJECT_STORE_CREATE, ObjectStore.class);

api/src/main/java/org/apache/cloudstack/api/ApiConstants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1218,6 +1218,7 @@ public class ApiConstants {
12181218
public static final String PENDING_JOBS_COUNT = "pendingjobscount";
12191219
public static final String AGENTS_COUNT = "agentscount";
12201220
public static final String AGENTS = "agents";
1221+
public static final String LAST_AGENTS = "lastagents";
12211222

12221223
public static final String PUBLIC_MTU = "publicmtu";
12231224
public static final String PRIVATE_MTU = "privatemtu";

api/src/main/java/org/apache/cloudstack/api/ApiErrorCode.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public enum ApiErrorCode {
3030
UNSUPPORTED_ACTION_ERROR(432),
3131
API_LIMIT_EXCEED(429),
3232

33+
SERVICE_UNAVAILABLE(503),
3334
INTERNAL_ERROR(530),
3435
ACCOUNT_ERROR(531),
3536
ACCOUNT_RESOURCE_LIMIT_ERROR(532),

api/src/main/java/org/apache/cloudstack/api/response/ManagementServerResponse.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,14 @@ public class ManagementServerResponse extends BaseResponse {
8282
@Param(description = "the Management Server Peers")
8383
private List<PeerManagementServerNodeResponse> peers;
8484

85+
@SerializedName(ApiConstants.LAST_AGENTS)
86+
@Param(description = "the last agents this Management Server is responsible for, before shutdown or preparing for maintenance", since = "4.21.0.0")
87+
private List<String> lastAgents;
88+
89+
@SerializedName(ApiConstants.AGENTS)
90+
@Param(description = "the agents this Management Server is responsible for", since = "4.21.0.0")
91+
private List<String> agents;
92+
8593
@SerializedName(ApiConstants.AGENTS_COUNT)
8694
@Param(description = "the number of host agents this Management Server is responsible for", since = "4.21.0.0")
8795
private Long agentsCount;
@@ -134,6 +142,14 @@ public String getIpAddress() {
134142
return ipAddress;
135143
}
136144

145+
public List<String> getLastAgents() {
146+
return lastAgents;
147+
}
148+
149+
public List<String> getAgents() {
150+
return agents;
151+
}
152+
137153
public Long getAgentsCount() {
138154
return this.agentsCount;
139155
}
@@ -190,6 +206,14 @@ public void setIpAddress(String ipAddress) {
190206
this.ipAddress = ipAddress;
191207
}
192208

209+
public void setLastAgents(List<String> lastAgents) {
210+
this.lastAgents = lastAgents;
211+
}
212+
213+
public void setAgents(List<String> agents) {
214+
this.agents = agents;
215+
}
216+
193217
public void setAgentsCount(Long agentsCount) {
194218
this.agentsCount = agentsCount;
195219
}

0 commit comments

Comments
 (0)