Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@
import javax.inject.Inject;
import javax.naming.ConfigurationException;

import com.cloud.configuration.Config;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GlobalLock;
import org.apache.cloudstack.agent.lb.IndirectAgentLB;
import org.apache.cloudstack.ca.CAManager;
import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
Expand All @@ -54,6 +51,7 @@
import org.apache.cloudstack.utils.identity.ManagementServerNode;
import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.MDC;

Expand Down Expand Up @@ -82,6 +80,7 @@
import com.cloud.agent.transport.Request;
import com.cloud.agent.transport.Response;
import com.cloud.alert.AlertManager;
import com.cloud.configuration.Config;
import com.cloud.configuration.ManagementServiceConfiguration;
import com.cloud.dc.ClusterVO;
import com.cloud.dc.DataCenterVO;
Expand All @@ -105,11 +104,13 @@
import com.cloud.resource.ResourceManager;
import com.cloud.resource.ResourceState;
import com.cloud.resource.ServerResource;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.Pair;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.concurrency.NamedThreadFactory;
import com.cloud.utils.db.DB;
import com.cloud.utils.db.EntityManager;
import com.cloud.utils.db.GlobalLock;
import com.cloud.utils.db.QueryBuilder;
import com.cloud.utils.db.SearchCriteria.Op;
import com.cloud.utils.db.TransactionLegacy;
Expand All @@ -124,7 +125,6 @@
import com.cloud.utils.nio.NioServer;
import com.cloud.utils.nio.Task;
import com.cloud.utils.time.InaccurateClock;
import org.apache.commons.lang3.StringUtils;

/**
* Implementation of the Agent Manager. This class controls the connection to the agents.
Expand Down Expand Up @@ -208,6 +208,11 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
protected final ConfigKey<Boolean> CheckTxnBeforeSending = new ConfigKey<Boolean>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
"This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);

public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
Host.Type.SecondaryStorage,
Host.Type.ConsoleProxy
);

@Override
public boolean configure(final String name, final Map<String, Object> params) throws ConfigurationException {

Expand Down Expand Up @@ -901,7 +906,10 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
if (determinedState == Status.Down) {
final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
s_logger.error(message);
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
if (Status.Down.equals(host.getStatus())) {
s_logger.debug(String.format("Skipping sending alert for %s as it already in %s state",
host, host.getStatus()));
} else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
}
event = Status.Event.HostDown;
Expand Down
28 changes: 21 additions & 7 deletions server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
Expand All @@ -41,6 +42,7 @@
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;

Expand Down Expand Up @@ -71,7 +73,6 @@
import com.cloud.network.VpcVirtualNetworkApplianceService;
import com.cloud.resource.ResourceManager;
import com.cloud.server.ManagementServer;
import com.cloud.service.ServiceOfferingVO;
import com.cloud.service.dao.ServiceOfferingDao;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.storage.StorageManager;
Expand Down Expand Up @@ -265,36 +266,49 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
s_logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());

final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId());
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());

// send an email alert that the host is down
StringBuilder sb = null;
List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>();
if ((vms != null) && !vms.isEmpty()) {
int skippedHAVms = 0;
if (CollectionUtils.isNotEmpty(vms)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this block could go in its own method.

sb = new StringBuilder();
sb.append(" Starting HA on the following VMs:");
// collect list of vm names for the alert email
for (int i = 0; i < vms.size(); i++) {
VMInstanceVO vm = vms.get(i);
for (VMInstanceVO vm : vms) {
Optional<HaWorkVO> item = pendingHaWorks.stream()
.filter(h -> h.getInstanceId() == vm.getId())
.reduce((first, second) -> second);
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
!item.get().canScheduleNew(_timeBetweenFailures))) {
s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
skippedHAVms++;
continue;
}
if (vm.getType() == VirtualMachine.Type.User) {
reorderedVMList.add(vm);
} else {
reorderedVMList.add(0, vm);
}
if (vm.isHaEnabled()) {
sb.append(" " + vm.getHostName());
sb.append(" ").append(vm.getHostName());
}
}
}

if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
s_logger.debug(String.format(
"Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
return;
}
// send an email alert that the host is down, include VMs
HostPodVO podVO = _podDao.findById(host.getPodId());
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc,
"Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));

for (VMInstanceVO vm : reorderedVMList) {
ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
if (s_logger.isDebugEnabled()){
s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
List<HaWorkVO> listPendingHaWorkForVm(long vmId);

List<HaWorkVO> listPendingMigrationsForVm(long vmId);

List<HaWorkVO> listPendingHAWorkForHost(long hostId);
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import java.util.Date;
import java.util.List;


import org.apache.log4j.Logger;
import org.springframework.stereotype.Component;

Expand Down Expand Up @@ -260,4 +259,17 @@ public int releaseWorkItems(long nodeId) {

return update(vo, sc);
}

@Override
public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
sb.and("hostId", sb.entity().getHostId(), Op.EQ);
sb.and("type", sb.entity().getWorkType(), Op.EQ);
sb.and("step", sb.entity().getStep(), Op.NIN);
SearchCriteria<HaWorkVO> sc = sb.create();
sc.setParameters("hostId", hostId);
sc.setParameters("type", WorkType.HA);
sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
return listBy(sc);
}
}
Loading