|
24 | 24 | import java.util.HashMap; |
25 | 25 | import java.util.List; |
26 | 26 | import java.util.Map; |
| 27 | +import java.util.Optional; |
27 | 28 | import java.util.concurrent.Executors; |
28 | 29 | import java.util.concurrent.ScheduledExecutorService; |
29 | 30 | import java.util.concurrent.TimeUnit; |
|
45 | 46 | import org.apache.cloudstack.managed.context.ManagedContextRunnable; |
46 | 47 | import org.apache.cloudstack.management.ManagementServerHost; |
47 | 48 | import org.apache.logging.log4j.ThreadContext; |
| 49 | +import org.apache.commons.collections.CollectionUtils; |
48 | 50 |
|
49 | 51 | import com.cloud.agent.AgentManager; |
50 | 52 | import com.cloud.alert.AlertManager; |
|
73 | 75 | import com.cloud.network.VpcVirtualNetworkApplianceService; |
74 | 76 | import com.cloud.resource.ResourceManager; |
75 | 77 | import com.cloud.server.ManagementServer; |
76 | | -import com.cloud.service.ServiceOfferingVO; |
77 | 78 | import com.cloud.service.dao.ServiceOfferingDao; |
78 | 79 | import com.cloud.storage.Storage.StoragePoolType; |
79 | 80 | import com.cloud.storage.StorageManager; |
@@ -236,6 +237,18 @@ public void setHaPlanners(List<HAPlanner> haPlanners) { |
236 | 237 | long _timeBetweenCleanups; |
237 | 238 | String _haTag = null; |
238 | 239 |
|
| 240 | + private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) { |
| 241 | + Optional<HaWorkVO> item = pendingHaWorks.stream() |
| 242 | + .filter(h -> h.getInstanceId() == vm.getId()) |
| 243 | + .reduce((first, second) -> second); |
| 244 | + if (item.isPresent() && (item.get().getTimesTried() < _maxRetries || |
| 245 | + !item.get().canScheduleNew(_timeBetweenFailures))) { |
| 246 | + s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm)); |
| 247 | + return true; |
| 248 | + } |
| 249 | + return false; |
| 250 | + } |
| 251 | + |
// Protected no-argument constructor with an empty body.
239 | 252 | protected HighAvailabilityManagerImpl() { |
240 | 253 | } |
241 | 254 |
|
@@ -295,36 +308,44 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, |
295 | 308 | logger.warn("Scheduling restart for VMs on host {}", host); |
296 | 309 |
|
297 | 310 | final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId()); |
| 311 | + final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId()); |
298 | 312 | final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); |
299 | 313 |
|
300 | 314 | // send an email alert that the host is down |
301 | 315 | StringBuilder sb = null; |
302 | 316 | List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>(); |
303 | | - if ((vms != null) && !vms.isEmpty()) { |
| 317 | + int skippedHAVms = 0; |
| 318 | + if (CollectionUtils.isNotEmpty(vms)) { |
304 | 319 | sb = new StringBuilder(); |
305 | 320 | sb.append(" Starting HA on the following VMs:"); |
306 | 321 | // collect list of vm names for the alert email |
307 | | - for (int i = 0; i < vms.size(); i++) { |
308 | | - VMInstanceVO vm = vms.get(i); |
| 322 | + for (VMInstanceVO vm : vms) { |
| 323 | + if (vmHasPendingHAJob(pendingHaWorks, vm)) { |
| 324 | + skippedHAVms++; |
| 325 | + continue; |
| 326 | + } |
309 | 327 | if (vm.getType() == VirtualMachine.Type.User) { |
310 | 328 | reorderedVMList.add(vm); |
311 | 329 | } else { |
312 | 330 | reorderedVMList.add(0, vm); |
313 | 331 | } |
314 | 332 | if (vm.isHaEnabled()) { |
315 | | - sb.append(" " + vm.getHostName()); |
| 333 | + sb.append(" ").append(vm.getHostName()); |
316 | 334 | } |
317 | 335 | } |
318 | 336 | } |
319 | | - |
| 337 | + if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) { |
| 338 | + s_logger.debug(String.format( |
| 339 | + "Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host)); |
| 340 | + return; |
| 341 | + } |
320 | 342 | // send an email alert that the host is down, include VMs |
321 | 343 | HostPodVO podVO = _podDao.findById(host.getPodId()); |
322 | 344 | String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); |
323 | 345 | _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc, |
324 | 346 | "Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : "")); |
325 | 347 |
|
326 | 348 | for (VMInstanceVO vm : reorderedVMList) { |
327 | | - ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId()); |
328 | 349 | if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) { |
329 | 350 | if (logger.isDebugEnabled()){ |
330 | 351 | logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host."); |
|
0 commit comments