Skip to content

Commit 919c979

Browse files
authored
server: prevent duplicate HA works and alerts (#10624)
Signed-off-by: Abhishek Kumar <[email protected]>
1 parent 1f8442e commit 919c979

File tree

5 files changed

+56
-15
lines changed

5 files changed

+56
-15
lines changed

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,6 @@
3838
import javax.inject.Inject;
3939
import javax.naming.ConfigurationException;
4040

41-
import com.cloud.configuration.Config;
42-
import com.cloud.utils.NumbersUtil;
43-
import com.cloud.utils.db.GlobalLock;
4441
import org.apache.cloudstack.agent.lb.IndirectAgentLB;
4542
import org.apache.cloudstack.ca.CAManager;
4643
import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
@@ -54,6 +51,7 @@
5451
import org.apache.cloudstack.utils.identity.ManagementServerNode;
5552
import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
5653
import org.apache.commons.lang3.BooleanUtils;
54+
import org.apache.commons.lang3.StringUtils;
5755
import org.apache.log4j.Logger;
5856
import org.apache.log4j.MDC;
5957

@@ -82,6 +80,7 @@
8280
import com.cloud.agent.transport.Request;
8381
import com.cloud.agent.transport.Response;
8482
import com.cloud.alert.AlertManager;
83+
import com.cloud.configuration.Config;
8584
import com.cloud.configuration.ManagementServiceConfiguration;
8685
import com.cloud.dc.ClusterVO;
8786
import com.cloud.dc.DataCenterVO;
@@ -105,11 +104,13 @@
105104
import com.cloud.resource.ResourceManager;
106105
import com.cloud.resource.ResourceState;
107106
import com.cloud.resource.ServerResource;
107+
import com.cloud.utils.NumbersUtil;
108108
import com.cloud.utils.Pair;
109109
import com.cloud.utils.component.ManagerBase;
110110
import com.cloud.utils.concurrency.NamedThreadFactory;
111111
import com.cloud.utils.db.DB;
112112
import com.cloud.utils.db.EntityManager;
113+
import com.cloud.utils.db.GlobalLock;
113114
import com.cloud.utils.db.QueryBuilder;
114115
import com.cloud.utils.db.SearchCriteria.Op;
115116
import com.cloud.utils.db.TransactionLegacy;
@@ -124,7 +125,6 @@
124125
import com.cloud.utils.nio.NioServer;
125126
import com.cloud.utils.nio.Task;
126127
import com.cloud.utils.time.InaccurateClock;
127-
import org.apache.commons.lang3.StringUtils;
128128

129129
/**
130130
* Implementation of the Agent Manager. This class controls the connection to the agents.
@@ -208,6 +208,11 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
208208
protected final ConfigKey<Boolean> CheckTxnBeforeSending = new ConfigKey<Boolean>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
209209
"This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);
210210

211+
public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
212+
Host.Type.SecondaryStorage,
213+
Host.Type.ConsoleProxy
214+
);
215+
211216
@Override
212217
public boolean configure(final String name, final Map<String, Object> params) throws ConfigurationException {
213218

@@ -901,7 +906,10 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
901906
if (determinedState == Status.Down) {
902907
final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
903908
s_logger.error(message);
904-
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
909+
if (Status.Down.equals(host.getStatus())) {
910+
s_logger.debug(String.format("Skipping sending alert for %s as it already in %s state",
911+
host, host.getStatus()));
912+
} else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
905913
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
906914
}
907915
event = Status.Event.HostDown;

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.HashMap;
2222
import java.util.List;
2323
import java.util.Map;
24+
import java.util.Optional;
2425
import java.util.concurrent.Executors;
2526
import java.util.concurrent.ScheduledExecutorService;
2627
import java.util.concurrent.TimeUnit;
@@ -41,6 +42,7 @@
4142
import org.apache.cloudstack.managed.context.ManagedContext;
4243
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
4344
import org.apache.cloudstack.management.ManagementServerHost;
45+
import org.apache.commons.collections.CollectionUtils;
4446
import org.apache.log4j.Logger;
4547
import org.apache.log4j.NDC;
4648

@@ -71,7 +73,6 @@
7173
import com.cloud.network.VpcVirtualNetworkApplianceService;
7274
import com.cloud.resource.ResourceManager;
7375
import com.cloud.server.ManagementServer;
74-
import com.cloud.service.ServiceOfferingVO;
7576
import com.cloud.service.dao.ServiceOfferingDao;
7677
import com.cloud.storage.Storage.StoragePoolType;
7778
import com.cloud.storage.StorageManager;
@@ -223,6 +224,18 @@ public void setHaPlanners(List<HAPlanner> haPlanners) {
223224
long _timeBetweenCleanups;
224225
String _haTag = null;
225226

227+
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
228+
Optional<HaWorkVO> item = pendingHaWorks.stream()
229+
.filter(h -> h.getInstanceId() == vm.getId())
230+
.reduce((first, second) -> second);
231+
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
232+
!item.get().canScheduleNew(_timeBetweenFailures))) {
233+
s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
234+
return true;
235+
}
236+
return false;
237+
}
238+
226239
protected HighAvailabilityManagerImpl() {
227240
}
228241

@@ -265,36 +278,44 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
265278
s_logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());
266279

267280
final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
281+
final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId());
268282
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
269283

270284
// send an email alert that the host is down
271285
StringBuilder sb = null;
272286
List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>();
273-
if ((vms != null) && !vms.isEmpty()) {
287+
int skippedHAVms = 0;
288+
if (CollectionUtils.isNotEmpty(vms)) {
274289
sb = new StringBuilder();
275290
sb.append(" Starting HA on the following VMs:");
276291
// collect list of vm names for the alert email
277-
for (int i = 0; i < vms.size(); i++) {
278-
VMInstanceVO vm = vms.get(i);
292+
for (VMInstanceVO vm : vms) {
293+
if (vmHasPendingHAJob(pendingHaWorks, vm)) {
294+
skippedHAVms++;
295+
continue;
296+
}
279297
if (vm.getType() == VirtualMachine.Type.User) {
280298
reorderedVMList.add(vm);
281299
} else {
282300
reorderedVMList.add(0, vm);
283301
}
284302
if (vm.isHaEnabled()) {
285-
sb.append(" " + vm.getHostName());
303+
sb.append(" ").append(vm.getHostName());
286304
}
287305
}
288306
}
289-
307+
if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
308+
s_logger.debug(String.format(
309+
"Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
310+
return;
311+
}
290312
// send an email alert that the host is down, include VMs
291313
HostPodVO podVO = _podDao.findById(host.getPodId());
292314
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
293315
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc,
294316
"Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));
295317

296318
for (VMInstanceVO vm : reorderedVMList) {
297-
ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
298319
if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
299320
if (s_logger.isDebugEnabled()){
300321
s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
8585
List<HaWorkVO> listPendingHaWorkForVm(long vmId);
8686

8787
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
88+
89+
List<HaWorkVO> listPendingHAWorkForHost(long hostId);
8890
}

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import java.util.Date;
2020
import java.util.List;
2121

22-
2322
import org.apache.log4j.Logger;
2423
import org.springframework.stereotype.Component;
2524

@@ -260,4 +259,17 @@ public int releaseWorkItems(long nodeId) {
260259

261260
return update(vo, sc);
262261
}
262+
263+
@Override
264+
public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
265+
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
266+
sb.and("hostId", sb.entity().getHostId(), Op.EQ);
267+
sb.and("type", sb.entity().getWorkType(), Op.EQ);
268+
sb.and("step", sb.entity().getStep(), Op.NIN);
269+
SearchCriteria<HaWorkVO> sc = sb.create();
270+
sc.setParameters("hostId", hostId);
271+
sc.setParameters("type", WorkType.HA);
272+
sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
273+
return listBy(sc);
274+
}
263275
}

server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@
6262
import com.cloud.network.VpcVirtualNetworkApplianceService;
6363
import com.cloud.resource.ResourceManager;
6464
import com.cloud.server.ManagementServer;
65-
import com.cloud.service.ServiceOfferingVO;
6665
import com.cloud.service.dao.ServiceOfferingDao;
6766
import com.cloud.storage.StorageManager;
6867
import com.cloud.storage.dao.GuestOSCategoryDao;
@@ -214,7 +213,6 @@ public void scheduleRestartForVmsOnHostNonEmptyVMList() {
214213
Mockito.when(_dcDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(DataCenterVO.class));
215214
Mockito.when(_haDao.findPreviousHA(Mockito.anyLong())).thenReturn(Arrays.asList(Mockito.mock(HaWorkVO.class)));
216215
Mockito.when(_haDao.persist((HaWorkVO)Mockito.anyObject())).thenReturn(Mockito.mock(HaWorkVO.class));
217-
Mockito.when(_serviceOfferingDao.findById(vm1.getServiceOfferingId())).thenReturn(Mockito.mock(ServiceOfferingVO.class));
218216

219217
highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
220218
}

0 commit comments

Comments
 (0)