Skip to content

Commit 0d9bc77

Browse files
shwstpprdhslove
authored andcommitted
server: prevent duplicate HA works and alerts (apache#10624)
Signed-off-by: Abhishek Kumar <[email protected]>
1 parent fba7c4a commit 0d9bc77

File tree

5 files changed

+55
-14
lines changed

5 files changed

+55
-14
lines changed

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,11 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
247247
protected final ConfigKey<Boolean> CheckTxnBeforeSending = new ConfigKey<Boolean>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
248248
"This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);
249249

250+
public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
251+
Host.Type.SecondaryStorage,
252+
Host.Type.ConsoleProxy
253+
);
254+
250255
@Override
251256
public boolean configure(final String name, final Map<String, Object> params) throws ConfigurationException {
252257

@@ -1106,9 +1111,11 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
11061111
if (determinedState == Status.Down) {
11071112
final String message = String.format("Host %s is down. Starting HA on the VMs", host);
11081113
logger.error(message);
1109-
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
1110-
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(),
1111-
host.getPodId(), String.format("Host down, %s", host), message);
1114+
if (Status.Down.equals(host.getStatus())) {
1115+
logger.debug(String.format("Skipping sending alert for %s as it already in %s state",
1116+
host, host.getStatus()));
1117+
} else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
1118+
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
11121119
}
11131120
event = Status.Event.HostDown;
11141121
} else if (determinedState == Status.Up) {

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.util.HashMap;
2525
import java.util.List;
2626
import java.util.Map;
27+
import java.util.Optional;
2728
import java.util.concurrent.Executors;
2829
import java.util.concurrent.ScheduledExecutorService;
2930
import java.util.concurrent.TimeUnit;
@@ -45,6 +46,7 @@
4546
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
4647
import org.apache.cloudstack.management.ManagementServerHost;
4748
import org.apache.logging.log4j.ThreadContext;
49+
import org.apache.commons.collections.CollectionUtils;
4850

4951
import com.cloud.agent.AgentManager;
5052
import com.cloud.alert.AlertManager;
@@ -73,7 +75,6 @@
7375
import com.cloud.network.VpcVirtualNetworkApplianceService;
7476
import com.cloud.resource.ResourceManager;
7577
import com.cloud.server.ManagementServer;
76-
import com.cloud.service.ServiceOfferingVO;
7778
import com.cloud.service.dao.ServiceOfferingDao;
7879
import com.cloud.storage.Storage.StoragePoolType;
7980
import com.cloud.storage.StorageManager;
@@ -236,6 +237,18 @@ public void setHaPlanners(List<HAPlanner> haPlanners) {
236237
long _timeBetweenCleanups;
237238
String _haTag = null;
238239

240+
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
241+
Optional<HaWorkVO> item = pendingHaWorks.stream()
242+
.filter(h -> h.getInstanceId() == vm.getId())
243+
.reduce((first, second) -> second);
244+
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
245+
!item.get().canScheduleNew(_timeBetweenFailures))) {
246+
s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
247+
return true;
248+
}
249+
return false;
250+
}
251+
239252
protected HighAvailabilityManagerImpl() {
240253
}
241254

@@ -295,36 +308,44 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate,
295308
logger.warn("Scheduling restart for VMs on host {}", host);
296309

297310
final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
311+
final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId());
298312
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
299313

300314
// send an email alert that the host is down
301315
StringBuilder sb = null;
302316
List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>();
303-
if ((vms != null) && !vms.isEmpty()) {
317+
int skippedHAVms = 0;
318+
if (CollectionUtils.isNotEmpty(vms)) {
304319
sb = new StringBuilder();
305320
sb.append(" Starting HA on the following VMs:");
306321
// collect list of vm names for the alert email
307-
for (int i = 0; i < vms.size(); i++) {
308-
VMInstanceVO vm = vms.get(i);
322+
for (VMInstanceVO vm : vms) {
323+
if (vmHasPendingHAJob(pendingHaWorks, vm)) {
324+
skippedHAVms++;
325+
continue;
326+
}
309327
if (vm.getType() == VirtualMachine.Type.User) {
310328
reorderedVMList.add(vm);
311329
} else {
312330
reorderedVMList.add(0, vm);
313331
}
314332
if (vm.isHaEnabled()) {
315-
sb.append(" " + vm.getHostName());
333+
sb.append(" ").append(vm.getHostName());
316334
}
317335
}
318336
}
319-
337+
if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
338+
s_logger.debug(String.format(
339+
"Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
340+
return;
341+
}
320342
// send an email alert that the host is down, include VMs
321343
HostPodVO podVO = _podDao.findById(host.getPodId());
322344
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
323345
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc,
324346
"Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));
325347

326348
for (VMInstanceVO vm : reorderedVMList) {
327-
ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
328349
if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
329350
if (logger.isDebugEnabled()){
330351
logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,10 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
8585
List<HaWorkVO> listPendingHaWorkForVm(long vmId);
8686

8787
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
88+
89+
List<HaWorkVO> listPendingHAWorkForHost(long hostId);
90+
8891
int expungeByVmList(List<Long> vmIds, Long batchSize);
8992
void markPendingWorksAsInvestigating();
9093
void markServerPendingWorksAsInvestigating(long managementServerId);
91-
}
94+
}

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,18 @@ public int releaseWorkItems(long nodeId) {
261261
}
262262

263263
@Override
264+
public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
265+
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
266+
sb.and("hostId", sb.entity().getHostId(), Op.EQ);
267+
sb.and("type", sb.entity().getWorkType(), Op.EQ);
268+
sb.and("step", sb.entity().getStep(), Op.NIN);
269+
SearchCriteria<HaWorkVO> sc = sb.create();
270+
sc.setParameters("hostId", hostId);
271+
sc.setParameters("type", WorkType.HA);
272+
sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
273+
return listBy(sc);
274+
}
275+
264276
public int expungeByVmList(List<Long> vmIds, Long batchSize) {
265277
if (CollectionUtils.isEmpty(vmIds)) {
266278
return 0;

server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@
6565
import com.cloud.network.VpcVirtualNetworkApplianceService;
6666
import com.cloud.resource.ResourceManager;
6767
import com.cloud.server.ManagementServer;
68-
import com.cloud.service.ServiceOfferingVO;
6968
import com.cloud.service.dao.ServiceOfferingDao;
7069
import com.cloud.storage.StorageManager;
7170
import com.cloud.storage.dao.GuestOSCategoryDao;
@@ -236,8 +235,7 @@ public void scheduleRestartForVmsOnHostNonEmptyVMList() {
236235
Mockito.when(_podDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(HostPodVO.class));
237236
Mockito.when(_dcDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(DataCenterVO.class));
238237
Mockito.when(_haDao.findPreviousHA(Mockito.anyLong())).thenReturn(Arrays.asList(Mockito.mock(HaWorkVO.class)));
239-
Mockito.when(_haDao.persist((HaWorkVO)Mockito.any())).thenReturn(Mockito.mock(HaWorkVO.class));
240-
Mockito.when(_serviceOfferingDao.findById(vm1.getServiceOfferingId())).thenReturn(Mockito.mock(ServiceOfferingVO.class));
238+
Mockito.when(_haDao.persist((HaWorkVO)Mockito.anyObject())).thenReturn(Mockito.mock(HaWorkVO.class));
241239

242240
ConfigKey<Boolean> haEnabled = Mockito.mock(ConfigKey.class);
243241
highAvailabilityManager.VmHaEnabled = haEnabled;

0 commit comments

Comments
 (0)