Skip to content

Commit 44e1eb6

Browse files
committed
add checks, at agent side as well
Signed-off-by: Abhishek Kumar <[email protected]>
1 parent 4e622f3 commit 44e1eb6

File tree

3 files changed

+35
-17
lines changed

3 files changed

+35
-17
lines changed

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,6 @@
3838
import javax.inject.Inject;
3939
import javax.naming.ConfigurationException;
4040

41-
import com.cloud.configuration.Config;
42-
import com.cloud.utils.NumbersUtil;
43-
import com.cloud.utils.db.GlobalLock;
4441
import org.apache.cloudstack.agent.lb.IndirectAgentLB;
4542
import org.apache.cloudstack.ca.CAManager;
4643
import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
@@ -54,6 +51,7 @@
5451
import org.apache.cloudstack.utils.identity.ManagementServerNode;
5552
import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
5653
import org.apache.commons.lang3.BooleanUtils;
54+
import org.apache.commons.lang3.StringUtils;
5755
import org.apache.log4j.Logger;
5856
import org.apache.log4j.MDC;
5957

@@ -82,6 +80,7 @@
8280
import com.cloud.agent.transport.Request;
8381
import com.cloud.agent.transport.Response;
8482
import com.cloud.alert.AlertManager;
83+
import com.cloud.configuration.Config;
8584
import com.cloud.configuration.ManagementServiceConfiguration;
8685
import com.cloud.dc.ClusterVO;
8786
import com.cloud.dc.DataCenterVO;
@@ -105,11 +104,13 @@
105104
import com.cloud.resource.ResourceManager;
106105
import com.cloud.resource.ResourceState;
107106
import com.cloud.resource.ServerResource;
107+
import com.cloud.utils.NumbersUtil;
108108
import com.cloud.utils.Pair;
109109
import com.cloud.utils.component.ManagerBase;
110110
import com.cloud.utils.concurrency.NamedThreadFactory;
111111
import com.cloud.utils.db.DB;
112112
import com.cloud.utils.db.EntityManager;
113+
import com.cloud.utils.db.GlobalLock;
113114
import com.cloud.utils.db.QueryBuilder;
114115
import com.cloud.utils.db.SearchCriteria.Op;
115116
import com.cloud.utils.db.TransactionLegacy;
@@ -124,7 +125,6 @@
124125
import com.cloud.utils.nio.NioServer;
125126
import com.cloud.utils.nio.Task;
126127
import com.cloud.utils.time.InaccurateClock;
127-
import org.apache.commons.lang3.StringUtils;
128128

129129
/**
130130
* Implementation of the Agent Manager. This class controls the connection to the agents.
@@ -208,6 +208,11 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
208208
protected final ConfigKey<Boolean> CheckTxnBeforeSending = new ConfigKey<Boolean>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
209209
"This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);
210210

211+
public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
212+
Host.Type.SecondaryStorage,
213+
Host.Type.ConsoleProxy
214+
);
215+
211216
@Override
212217
public boolean configure(final String name, final Map<String, Object> params) throws ConfigurationException {
213218

@@ -901,7 +906,10 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
901906
if (determinedState == Status.Down) {
902907
final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
903908
s_logger.error(message);
904-
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
909+
if (Status.Down.equals(host.getStatus())) {
910+
s_logger.debug(String.format("Skipping sending alert for %s as it already in %s state",
911+
host, host.getStatus()));
912+
} else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
905913
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
906914
}
907915
event = Status.Event.HostDown;

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.HashMap;
2222
import java.util.List;
2323
import java.util.Map;
24+
import java.util.Optional;
2425
import java.util.concurrent.Executors;
2526
import java.util.concurrent.ScheduledExecutorService;
2627
import java.util.concurrent.TimeUnit;
@@ -41,6 +42,7 @@
4142
import org.apache.cloudstack.managed.context.ManagedContext;
4243
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
4344
import org.apache.cloudstack.management.ManagementServerHost;
45+
import org.apache.commons.collections.CollectionUtils;
4446
import org.apache.log4j.Logger;
4547
import org.apache.log4j.NDC;
4648

@@ -71,7 +73,6 @@
7173
import com.cloud.network.VpcVirtualNetworkApplianceService;
7274
import com.cloud.resource.ResourceManager;
7375
import com.cloud.server.ManagementServer;
74-
import com.cloud.service.ServiceOfferingVO;
7576
import com.cloud.service.dao.ServiceOfferingDao;
7677
import com.cloud.storage.Storage.StoragePoolType;
7778
import com.cloud.storage.StorageManager;
@@ -271,35 +272,43 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
271272
// send an email alert that the host is down
272273
StringBuilder sb = null;
273274
List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>();
274-
if ((vms != null) && !vms.isEmpty()) {
275+
int skippedHAVms = 0;
276+
if (CollectionUtils.isNotEmpty(vms)) {
275277
sb = new StringBuilder();
276278
sb.append(" Starting HA on the following VMs:");
277279
// collect list of vm names for the alert email
278-
for (int i = 0; i < vms.size(); i++) {
279-
VMInstanceVO vm = vms.get(i);
280+
for (VMInstanceVO vm : vms) {
281+
Optional<HaWorkVO> item = pendingHaWorks.stream()
282+
.filter(h -> h.getInstanceId() == vm.getId())
283+
.reduce((first, second) -> second);
284+
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
285+
!item.get().canScheduleNew(_timeBetweenFailures))) {
286+
s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
287+
skippedHAVms++;
288+
continue;
289+
}
280290
if (vm.getType() == VirtualMachine.Type.User) {
281291
reorderedVMList.add(vm);
282292
} else {
283293
reorderedVMList.add(0, vm);
284294
}
285295
if (vm.isHaEnabled()) {
286-
sb.append(" " + vm.getHostName());
296+
sb.append(" ").append(vm.getHostName());
287297
}
288298
}
289299
}
290-
300+
if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
301+
s_logger.debug(String.format(
302+
"Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
303+
return;
304+
}
291305
// send an email alert that the host is down, include VMs
292306
HostPodVO podVO = _podDao.findById(host.getPodId());
293307
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
294308
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc,
295309
"Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));
296310

297311
for (VMInstanceVO vm : reorderedVMList) {
298-
VMInstanceVO finalVm = vm;
299-
if (pendingHaWorks.stream().anyMatch(h -> h.getInstanceId() == finalVm.getId())) {
300-
continue;
301-
}
302-
ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
303312
if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
304313
if (s_logger.isDebugEnabled()){
305314
s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import java.util.Date;
2020
import java.util.List;
2121

22-
2322
import org.apache.log4j.Logger;
2423
import org.springframework.stereotype.Component;
2524

@@ -266,9 +265,11 @@ public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
266265
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
267266
sb.and("hostId", sb.entity().getHostId(), Op.EQ);
268267
sb.and("type", sb.entity().getWorkType(), Op.EQ);
268+
sb.and("step", sb.entity().getStep(), Op.NIN);
269269
SearchCriteria<HaWorkVO> sc = sb.create();
270270
sc.setParameters("hostId", hostId);
271271
sc.setParameters("type", WorkType.HA);
272+
sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
272273
return listBy(sc);
273274
}
274275
}

0 commit comments

Comments
 (0)