Skip to content

Commit 4c3f29d

Browse files
Agent manager connection handling improvements (#11376)
* Agent manager connection handling improvements
* Fix to send LB check interval in ready command
1 parent d7b7bd5 commit 4c3f29d

File tree

6 files changed

+103
-75
lines changed

6 files changed

+103
-75
lines changed

agent/src/main/java/com/cloud/agent/Agent.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ private void scheduleHostLBCheckerTask(final String lbAlgorithm, final long chec
475475
return;
476476
}
477477

478-
logger.info("Scheduling a recurring preferred host checker task with lb algorithm '{}' and host.lb.interval={} ms", lbAlgorithm, checkInterval);
478+
logger.info("Scheduling a recurring preferred host checker task with host.lb.interval={} ms", checkInterval);
479479
hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name)));
480480
hostLbCheckExecutor.scheduleAtFixedRate(new PreferredHostCheckerTask(), checkInterval, checkInterval,
481481
TimeUnit.MILLISECONDS);

core/src/main/java/com/cloud/agent/api/ReadyCommand.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,6 @@
2626
public class ReadyCommand extends Command {
2727
private String _details;
2828

29-
public ReadyCommand() {
30-
super();
31-
}
32-
3329
private Long dcId;
3430
private Long hostId;
3531
private String hostUuid;
@@ -41,6 +37,10 @@ public ReadyCommand() {
4137
private Boolean enableHumanReadableSizes;
4238
private String arch;
4339

40+
public ReadyCommand() {
41+
super();
42+
}
43+
4444
public ReadyCommand(Long dcId) {
4545
super();
4646
this.dcId = dcId;
@@ -95,7 +95,7 @@ public List<String> getAvoidMsHostList() {
9595
return avoidMsHostList;
9696
}
9797

98-
public void setAvoidMsHostList(List<String> msHostList) {
98+
public void setAvoidMsHostList(List<String> avoidMsHostList) {
9999
this.avoidMsHostList = avoidMsHostList;
100100
}
101101

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 89 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.util.HashMap;
2828
import java.util.List;
2929
import java.util.Map;
30+
import java.util.Objects;
3031
import java.util.Set;
3132
import java.util.concurrent.ConcurrentHashMap;
3233
import java.util.concurrent.ExecutorService;
@@ -758,15 +759,15 @@ public void notifyMonitorsOfNewlyAddedHost(long hostId) {
758759
}
759760
}
760761

761-
protected AgentAttache notifyMonitorsOfConnection(final AgentAttache attache, final StartupCommand[] cmd, final boolean forRebalance) throws ConnectionException {
762+
protected AgentAttache notifyMonitorsOfConnection(final AgentAttache attache, final StartupCommand[] cmds, final boolean forRebalance) throws ConnectionException {
762763
final long hostId = attache.getId();
763764
final HostVO host = _hostDao.findById(hostId);
764765
for (final Pair<Integer, Listener> monitor : _hostMonitors) {
765766
logger.debug("Sending Connect to listener: {}, for rebalance: {}", monitor.second().getClass().getSimpleName(), forRebalance);
766-
for (int i = 0; i < cmd.length; i++) {
767+
for (StartupCommand cmd : cmds) {
767768
try {
768-
logger.debug("process connection to issue: {} for host: {}, forRebalance: {}, connection transferred: {}", ReflectionToStringBuilderUtils.reflectCollection(cmd[i]), hostId, forRebalance, cmd[i].isConnectionTransferred());
769-
monitor.second().processConnect(host, cmd[i], forRebalance);
769+
logger.debug("process connection to issue: {} for host: {}, forRebalance: {}", ReflectionToStringBuilderUtils.reflectOnlySelectedFields(cmd, "id", "type", "msHostList", "connectionTransferred"), hostId, forRebalance);
770+
monitor.second().processConnect(host, cmd, forRebalance);
770771
} catch (final ConnectionException ce) {
771772
if (ce.isSetupError()) {
772773
logger.warn("Monitor {} says there is an error in the connect process for {} due to {}", monitor.second().getClass().getSimpleName(), hostId, ce.getMessage());
@@ -1040,39 +1041,50 @@ protected Status getNextStatusOnDisconnection(Host host, final Status.Event even
10401041

10411042
protected boolean handleDisconnectWithoutInvestigation(final AgentAttache attache, final Status.Event event, final boolean transitState, final boolean removeAgent) {
10421043
final long hostId = attache.getId();
1043-
1044+
final HostVO host = _hostDao.findById(hostId);
10441045
boolean result = false;
10451046
GlobalLock joinLock = getHostJoinLock(hostId);
1046-
if (joinLock.lock(60)) {
1047-
try {
1048-
logger.info("Host {} is disconnecting with event {}",
1049-
attache, event);
1050-
Status nextStatus;
1051-
final HostVO host = _hostDao.findById(hostId);
1052-
if (host == null) {
1053-
logger.warn("Can't find host with {} ({})", hostId, attache);
1054-
nextStatus = Status.Removed;
1055-
} else {
1056-
nextStatus = getNextStatusOnDisconnection(host, event);
1057-
caService.purgeHostCertificate(host);
1058-
}
1059-
logger.debug("Deregistering link for {} with state {}", attache, nextStatus);
1060-
1061-
removeAgent(attache, nextStatus);
1062-
1063-
if (host != null && transitState) {
1064-
// update the state for host in DB as per the event
1065-
disconnectAgent(host, event, _nodeId);
1066-
}
1067-
} finally {
1068-
joinLock.unlock();
1047+
try {
1048+
if (!joinLock.lock(60)) {
1049+
logger.debug("Unable to acquire lock on host {} to process agent disconnection", Objects.toString(host, String.valueOf(hostId)));
1050+
return result;
10691051
}
1052+
1053+
logger.debug("Acquired lock on host {}, to process agent disconnection", Objects.toString(host, String.valueOf(hostId)));
1054+
disconnectHostAgent(attache, event, host, transitState, joinLock);
10701055
result = true;
1056+
} finally {
1057+
joinLock.releaseRef();
10711058
}
1072-
joinLock.releaseRef();
1059+
10731060
return result;
10741061
}
10751062

1063+
private void disconnectHostAgent(final AgentAttache attache, final Status.Event event, final HostVO host, final boolean transitState, final GlobalLock joinLock) {
1064+
try {
1065+
logger.info("Host {} is disconnecting with event {}", attache, event);
1066+
final long hostId = attache.getId();
1067+
Status nextStatus;
1068+
if (host == null) {
1069+
logger.warn("Can't find host with {} ({})", hostId, attache);
1070+
nextStatus = Status.Removed;
1071+
} else {
1072+
nextStatus = getNextStatusOnDisconnection(host, event);
1073+
caService.purgeHostCertificate(host);
1074+
}
1075+
logger.debug("Deregistering link for {} with state {}", attache, nextStatus);
1076+
1077+
removeAgent(attache, nextStatus);
1078+
1079+
if (host != null && transitState) {
1080+
// update the state for host in DB as per the event
1081+
disconnectAgent(host, event, _nodeId);
1082+
}
1083+
} finally {
1084+
joinLock.unlock();
1085+
}
1086+
}
1087+
10761088
protected boolean handleDisconnectWithInvestigation(final AgentAttache attache, Status.Event event) {
10771089
final long hostId = attache.getId();
10781090
HostVO host = _hostDao.findById(hostId);
@@ -1341,45 +1353,58 @@ protected AgentAttache createAttacheForConnect(final HostVO host, final Link lin
13411353
return attache;
13421354
}
13431355

1344-
private AgentAttache sendReadyAndGetAttache(HostVO host, ReadyCommand ready, Link link, StartupCommand[] startup) throws ConnectionException {
1345-
final List<String> agentMSHostList = new ArrayList<>();
1346-
String lbAlgorithm = null;
1347-
if (startup != null && startup.length > 0) {
1348-
final String agentMSHosts = startup[0].getMsHostList();
1349-
if (StringUtils.isNotEmpty(agentMSHosts)) {
1350-
String[] msHosts = agentMSHosts.split("@");
1351-
if (msHosts.length > 1) {
1352-
lbAlgorithm = msHosts[1];
1353-
}
1354-
agentMSHostList.addAll(Arrays.asList(msHosts[0].split(",")));
1355-
}
1356-
}
1357-
ready.setArch(host.getArch().getType());
1356+
private AgentAttache sendReadyAndGetAttache(HostVO host, ReadyCommand ready, Link link, StartupCommand[] startupCmds) throws ConnectionException {
13581357
AgentAttache attache;
13591358
GlobalLock joinLock = getHostJoinLock(host.getId());
1360-
if (joinLock.lock(60)) {
1361-
try {
1359+
try {
1360+
if (!joinLock.lock(60)) {
1361+
throw new ConnectionException(true, String.format("Unable to acquire lock on host %s, to process agent connection", host));
1362+
}
1363+
1364+
logger.debug("Acquired lock on host {}, to process agent connection", host);
1365+
attache = connectHostAgent(host, ready, link, startupCmds, joinLock);
1366+
} finally {
1367+
joinLock.releaseRef();
1368+
}
13621369

1363-
if (!indirectAgentLB.compareManagementServerList(host.getId(), host.getDataCenterId(), agentMSHostList, lbAlgorithm)) {
1364-
final List<String> newMSList = indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(), null);
1365-
ready.setMsHostList(newMSList);
1366-
final List<String> avoidMsList = _mshostDao.listNonUpStateMsIPs();
1367-
ready.setAvoidMsHostList(avoidMsList);
1368-
ready.setLbAlgorithm(indirectAgentLB.getLBAlgorithmName());
1369-
ready.setLbCheckInterval(indirectAgentLB.getLBPreferredHostCheckInterval(host.getClusterId()));
1370-
logger.debug("Agent's management server host list is not up to date, sending list update: {}", newMSList);
1370+
return attache;
1371+
}
1372+
1373+
private AgentAttache connectHostAgent(HostVO host, ReadyCommand ready, Link link, StartupCommand[] startupCmds, GlobalLock joinLock) throws ConnectionException {
1374+
AgentAttache attache;
1375+
try {
1376+
final List<String> agentMSHostList = new ArrayList<>();
1377+
String lbAlgorithm = null;
1378+
if (startupCmds != null && startupCmds.length > 0) {
1379+
final String agentMSHosts = startupCmds[0].getMsHostList();
1380+
if (StringUtils.isNotEmpty(agentMSHosts)) {
1381+
String[] msHosts = agentMSHosts.split("@");
1382+
if (msHosts.length > 1) {
1383+
lbAlgorithm = msHosts[1];
1384+
}
1385+
agentMSHostList.addAll(Arrays.asList(msHosts[0].split(",")));
13711386
}
1387+
}
13721388

1373-
attache = createAttacheForConnect(host, link);
1374-
attache = notifyMonitorsOfConnection(attache, startup, false);
1375-
} finally {
1376-
joinLock.unlock();
1389+
if (!indirectAgentLB.compareManagementServerListAndLBAlgorithm(host.getId(), host.getDataCenterId(), agentMSHostList, lbAlgorithm)) {
1390+
final List<String> newMSList = indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(), null);
1391+
ready.setMsHostList(newMSList);
1392+
String newLBAlgorithm = indirectAgentLB.getLBAlgorithmName();
1393+
ready.setLbAlgorithm(newLBAlgorithm);
1394+
logger.debug("Agent's management server host list or lb algorithm is not up to date, sending list and algorithm update: {}, {}", newMSList, newLBAlgorithm);
13771395
}
1378-
} else {
1379-
throw new ConnectionException(true,
1380-
String.format("Unable to acquire lock on host %s", host));
1396+
1397+
final List<String> avoidMsList = _mshostDao.listNonUpStateMsIPs();
1398+
ready.setAvoidMsHostList(avoidMsList);
1399+
ready.setLbCheckInterval(indirectAgentLB.getLBPreferredHostCheckInterval(host.getClusterId()));
1400+
ready.setArch(host.getArch().getType());
1401+
1402+
attache = createAttacheForConnect(host, link);
1403+
attache = notifyMonitorsOfConnection(attache, startupCmds, false);
1404+
} finally {
1405+
joinLock.unlock();
13811406
}
1382-
joinLock.releaseRef();
1407+
13831408
return attache;
13841409
}
13851410

@@ -1666,7 +1691,7 @@ protected void processRequest(final Link link, final Request request) {
16661691
logger.debug("Not processing {} for agent id={}; can't find the host in the DB", PingRoutingCommand.class.getSimpleName(), cmdHostId);
16671692
}
16681693
}
1669-
if (host!= null && host.getStatus() != Status.Up && gatewayAccessible) {
1694+
if (host != null && host.getStatus() != Status.Up && gatewayAccessible) {
16701695
requestStartupCommand = true;
16711696
}
16721697
final List<String> avoidMsList = _mshostDao.listNonUpStateMsIPs();
@@ -1821,11 +1846,11 @@ protected boolean isHostOwnerSwitched(final HostVO host) {
18211846
return false;
18221847
}
18231848

1824-
private void disconnectInternal(final long hostId, final Status.Event event, final boolean invstigate) {
1849+
private void disconnectInternal(final long hostId, final Status.Event event, final boolean investigate) {
18251850
final AgentAttache attache = findAttache(hostId);
18261851

18271852
if (attache != null) {
1828-
if (!invstigate) {
1853+
if (!investigate) {
18291854
disconnectWithoutInvestigation(attache, event);
18301855
} else {
18311856
disconnectWithInvestigation(attache, event);

engine/orchestration/src/main/java/com/cloud/agent/manager/ConnectedAgentAttache.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,10 @@ public synchronized boolean isClosed() {
5454
@Override
5555
public void disconnect(final Status state) {
5656
synchronized (this) {
57-
logger.debug("Processing Disconnect.");
57+
logger.debug("Processing disconnect [id: {}, uuid: {}, name: {}]", _id, _uuid, _name);
58+
5859
if (_link != null) {
60+
logger.debug("Disconnecting from {}, Socket Address: {}", _link.getIpAddress(), _link.getSocketAddress());
5961
_link.close();
6062
_link.terminated();
6163
}

framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ public interface IndirectAgentLB {
4848
List<String> getManagementServerList(Long hostId, Long dcId, List<Long> orderedHostIdList, String lbAlgorithm);
4949

5050
/**
51-
* Compares received management server list against expected list for a host in a zone.
51+
* Compares the received management server list and LB algorithm against the expected values for a host in a zone.
5252
* @param hostId host id
5353
* @param dcId zone id
5454
* @param receivedMSHosts received management server list
55-
* @return true if mgmtHosts is up to date, false if not
55+
* @param lbAlgorithm received LB algorithm
56+
* @return true if the received management server list and LB algorithm are up to date, false if not
5657
*/
57-
boolean compareManagementServerList(Long hostId, Long dcId, List<String> receivedMSHosts, String lbAlgorithm);
58+
boolean compareManagementServerListAndLBAlgorithm(Long hostId, Long dcId, List<String> receivedMSHosts, String lbAlgorithm);
5859

5960
/**
6061
* Returns the configured LB algorithm

server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ public List<String> getManagementServerList(final Long hostId, final Long dcId,
149149
}
150150

151151
@Override
152-
public boolean compareManagementServerList(final Long hostId, final Long dcId, final List<String> receivedMSHosts, final String lbAlgorithm) {
152+
public boolean compareManagementServerListAndLBAlgorithm(final Long hostId, final Long dcId, final List<String> receivedMSHosts, final String lbAlgorithm) {
153153
if (receivedMSHosts == null || receivedMSHosts.isEmpty()) {
154154
return false;
155155
}

0 commit comments

Comments
 (0)