diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
index d2e86fbc4b9e..4b2578d20c45 100644
--- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
+++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
@@ -38,9 +38,6 @@
 import javax.inject.Inject;
 import javax.naming.ConfigurationException;
 
-import com.cloud.configuration.Config;
-import com.cloud.utils.NumbersUtil;
-import com.cloud.utils.db.GlobalLock;
 import org.apache.cloudstack.agent.lb.IndirectAgentLB;
 import org.apache.cloudstack.ca.CAManager;
 import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
@@ -54,6 +51,7 @@
 import org.apache.cloudstack.utils.identity.ManagementServerNode;
 import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
 import org.apache.commons.lang3.BooleanUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.log4j.Logger;
 import org.apache.log4j.MDC;
 
@@ -82,6 +80,7 @@
 import com.cloud.agent.transport.Request;
 import com.cloud.agent.transport.Response;
 import com.cloud.alert.AlertManager;
+import com.cloud.configuration.Config;
 import com.cloud.configuration.ManagementServiceConfiguration;
 import com.cloud.dc.ClusterVO;
 import com.cloud.dc.DataCenterVO;
@@ -105,11 +104,13 @@
 import com.cloud.resource.ResourceManager;
 import com.cloud.resource.ResourceState;
 import com.cloud.resource.ServerResource;
+import com.cloud.utils.NumbersUtil;
 import com.cloud.utils.Pair;
 import com.cloud.utils.component.ManagerBase;
 import com.cloud.utils.concurrency.NamedThreadFactory;
 import com.cloud.utils.db.DB;
 import com.cloud.utils.db.EntityManager;
+import com.cloud.utils.db.GlobalLock;
 import com.cloud.utils.db.QueryBuilder;
 import com.cloud.utils.db.SearchCriteria.Op;
 import com.cloud.utils.db.TransactionLegacy;
@@ -124,7 +125,6 @@
 import com.cloud.utils.nio.NioServer;
 import com.cloud.utils.nio.Task;
 import com.cloud.utils.time.InaccurateClock;
-import org.apache.commons.lang3.StringUtils;
 
 /**
  * Implementation of the Agent Manager. This class controls the connection to the agents.
@@ -208,6 +208,14 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
     protected final ConfigKey CheckTxnBeforeSending = new ConfigKey("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
             "This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);
 
+    /**
+     * Host types for which handleDisconnectWithInvestigation does not send a host-down alert.
+     */
+    public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
+            Host.Type.SecondaryStorage,
+            Host.Type.ConsoleProxy
+    );
+
     @Override
     public boolean configure(final String name, final Map params) throws ConfigurationException {
 
@@ -901,7 +909,10 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
                 if (determinedState == Status.Down) {
                     final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
                     s_logger.error(message);
-                    if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
+                    if (Status.Down.equals(host.getStatus())) {
+                        s_logger.debug(String.format("Skipping sending alert for %s as it is already in %s state",
+                                host, host.getStatus()));
+                    } else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
                         _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
                     }
                     event = Status.Event.HostDown;
diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
index 81ae44a97634..2716a0bceff9 100644
--- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
+++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
@@ -21,6 +21,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
@@ -41,6 +42,7 @@
 import org.apache.cloudstack.managed.context.ManagedContext;
 import org.apache.cloudstack.managed.context.ManagedContextRunnable;
 import org.apache.cloudstack.management.ManagementServerHost;
+import org.apache.commons.collections.CollectionUtils;
 import org.apache.log4j.Logger;
 import org.apache.log4j.NDC;
 
@@ -71,7 +73,6 @@
 import com.cloud.network.VpcVirtualNetworkApplianceService;
 import com.cloud.resource.ResourceManager;
 import com.cloud.server.ManagementServer;
-import com.cloud.service.ServiceOfferingVO;
 import com.cloud.service.dao.ServiceOfferingDao;
 import com.cloud.storage.Storage.StoragePoolType;
 import com.cloud.storage.StorageManager;
@@ -223,6 +224,28 @@ public void setHaPlanners(List haPlanners) {
     long _timeBetweenCleanups;
     String _haTag = null;
 
+    /**
+     * Tells whether HA scheduling should be skipped for a VM because of the last entry
+     * for that VM in the given pending HA work list.
+     *
+     * @param pendingHaWorks pending HA work items, matched against the VM by instance id
+     * @param vm the VM that is about to be scheduled for HA
+     * @return {@code true} when such an entry exists and either its tried count is below
+     *         {@code _maxRetries} or {@code canScheduleNew(_timeBetweenFailures)} is false for it;
+     *         {@code false} otherwise, including when the list has no entry for the VM
+     */
+    private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
+        Optional<HaWorkVO> item = pendingHaWorks.stream()
+                .filter(h -> h.getInstanceId() == vm.getId())
+                .reduce((first, second) -> second);
+        if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
+                !item.get().canScheduleNew(_timeBetweenFailures))) {
+            s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
+            return true;
+        }
+        return false;
+    }
+
     protected HighAvailabilityManagerImpl() {
     }
 
@@ -265,28 +288,37 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
         s_logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());
 
         final List vms = _instanceDao.listByHostId(host.getId());
+        final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId());
         final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
 
         // send an email alert that the host is down
         StringBuilder sb = null;
         List reorderedVMList = new ArrayList();
-        if ((vms != null) && !vms.isEmpty()) {
+        int skippedHAVms = 0;
+        if (CollectionUtils.isNotEmpty(vms)) {
             sb = new StringBuilder();
             sb.append(" Starting HA on the following VMs:");
             // collect list of vm names for the alert email
-            for (int i = 0; i < vms.size(); i++) {
-                VMInstanceVO vm = vms.get(i);
+            for (VMInstanceVO vm : vms) {
+                if (vmHasPendingHAJob(pendingHaWorks, vm)) {
+                    skippedHAVms++;
+                    continue;
+                }
                 if (vm.getType() == VirtualMachine.Type.User) {
                     reorderedVMList.add(vm);
                 } else {
                     reorderedVMList.add(0, vm);
                 }
                 if (vm.isHaEnabled()) {
-                    sb.append(" " + vm.getHostName());
+                    sb.append(" ").append(vm.getHostName());
                 }
             }
         }
-
+        if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
+            s_logger.debug(String.format(
+                    "Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
+            return;
+        }
         // send an email alert that the host is down, include VMs
         HostPodVO podVO = _podDao.findById(host.getPodId());
         String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
@@ -294,7 +326,6 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
                 "Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));
 
         for (VMInstanceVO vm : reorderedVMList) {
-            ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
             if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
                 if (s_logger.isDebugEnabled()){
                     s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");
diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java
index e8a3e17f8052..395b74e04645 100644
--- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java
+++ b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java
@@ -85,4 +85,12 @@ public interface HighAvailabilityDao extends GenericDao {
     List listPendingHaWorkForVm(long vmId);
 
     List listPendingMigrationsForVm(long vmId);
+
+    /**
+     * Lists the HA work items recorded for a host that are still pending.
+     *
+     * @param hostId id of the host whose work items are looked up
+     * @return work items of type HA for the host whose step is none of Done, Cancelled or Error
+     */
+    List<HaWorkVO> listPendingHAWorkForHost(long hostId);
 }
diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java
index c7284053fb2e..7057ad84a42b 100644
--- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java
+++ b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java
@@ -19,7 +19,6 @@
 import java.util.Date;
 import java.util.List;
 
-
 import org.apache.log4j.Logger;
 import org.springframework.stereotype.Component;
 
@@ -260,4 +259,17 @@ public int releaseWorkItems(long nodeId) {
 
         return update(vo, sc);
     }
+
+    @Override
+    public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
+        SearchBuilder<HaWorkVO> sb = createSearchBuilder();
+        sb.and("hostId", sb.entity().getHostId(), Op.EQ);
+        sb.and("type", sb.entity().getWorkType(), Op.EQ);
+        sb.and("step", sb.entity().getStep(), Op.NIN);
+        SearchCriteria<HaWorkVO> sc = sb.create();
+        sc.setParameters("hostId", hostId);
+        sc.setParameters("type", WorkType.HA);
+        sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
+        return listBy(sc);
+    }
 }
diff --git a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
index 53e5a26849fc..f542aee5d6d3 100644
--- a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
+++ b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
@@ -62,7 +62,6 @@
 import com.cloud.network.VpcVirtualNetworkApplianceService;
 import com.cloud.resource.ResourceManager;
 import com.cloud.server.ManagementServer;
-import com.cloud.service.ServiceOfferingVO;
 import com.cloud.service.dao.ServiceOfferingDao;
 import com.cloud.storage.StorageManager;
 import com.cloud.storage.dao.GuestOSCategoryDao;
@@ -214,7 +213,6 @@ public void scheduleRestartForVmsOnHostNonEmptyVMList() {
         Mockito.when(_dcDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(DataCenterVO.class));
         Mockito.when(_haDao.findPreviousHA(Mockito.anyLong())).thenReturn(Arrays.asList(Mockito.mock(HaWorkVO.class)));
         Mockito.when(_haDao.persist((HaWorkVO)Mockito.anyObject())).thenReturn(Mockito.mock(HaWorkVO.class));
-        Mockito.when(_serviceOfferingDao.findById(vm1.getServiceOfferingId())).thenReturn(Mockito.mock(ServiceOfferingVO.class));
         highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
     }
 