Skip to content

Commit 5d83f49

Browse files
sureshanapartidhslove
authored andcommitted
Support to enable/disable VM High Availability manager and related alerts (apache#10118)
- Adds new config 'vm.ha.enabled' with Zone scope, to enable/disable VM High Availability manager. This is enabled by default (for backward compatibility). When enabled, the VM HA WorkItems (for VM Stop, Restart, Migration, Destroy) can be created and the scheduled items are executed. When disabled, new VM HA WorkItems are not allowed and the scheduled items are retried until max retries configured at 'vm.ha.migration.max.retries' (executed in case HA is re-enabled during retry attempts), and then purged after 'time.between.failures' by the cleanup thread that runs regularly at 'time.between.cleanup'. - Adds new config 'vm.ha.alerts.enabled' with Zone scope, to enable/disable alerts for the VM HA operations. This is enabled by default.
1 parent 023b98a commit 5d83f49

File tree

4 files changed

+281
-20
lines changed

4 files changed

+281
-20
lines changed

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
*/
3333
public interface HighAvailabilityManager extends Manager {
3434

35-
public ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
35+
ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
3636
"Force High-Availability to happen even if the VM says no.", true, Cluster);
3737

3838
ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",
@@ -112,7 +112,7 @@ enum Step {
112112

113113
void cancelDestroy(VMInstanceVO vm, Long hostId);
114114

115-
void scheduleDestroy(VMInstanceVO vm, long hostId);
115+
boolean scheduleDestroy(VMInstanceVO vm, long hostId);
116116

117117
/**
118118
* Schedule restarts for all vms running on the host.
@@ -143,7 +143,7 @@ enum Step {
143143
* @param host host the virtual machine is on.
144144
* @param type which type of stop is requested.
145145
*/
146-
void scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
146+
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
147147

148148
void cancelScheduledMigrations(HostVO host);
149149

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 115 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
// under the License.
1717
package com.cloud.ha;
1818

19+
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;
20+
1921
import java.util.ArrayList;
2022
import java.util.Date;
2123
import java.util.HashMap;
@@ -121,6 +123,16 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
121123
"Total number of attempts for trying migration of a VM.",
122124
true, ConfigKey.Scope.Global);
123125

126+
public static ConfigKey<Boolean> VmHaEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.enabled", "true",
127+
"Enable/Disable VM High Availability manager, it is enabled by default."
128+
+ " When enabled, the VM HA WorkItems (for VM Stop, Restart, Migration, Destroy) can be created and the scheduled items are executed; and"
129+
+ " When disabled, new VM HA WorkItems are not allowed and the scheduled items are retried until max retries configured at 'vm.ha.migration.max.retries'"
130+
+ " (executed in case HA is re-enabled during retry attempts), and then purged after 'time.between.failures' by the cleanup thread that runs"
131+
+ " regularly at 'time.between.cleanup'", true, Zone);
132+
133+
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
134+
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);
135+
124136
WorkerThread[] _workers;
125137
boolean _stopped;
126138
long _timeToSleep;
@@ -185,7 +197,6 @@ public void setHaPlanners(List<HAPlanner> haPlanners) {
185197
_haPlanners = haPlanners;
186198
}
187199

188-
189200
@Inject
190201
AgentManager _agentMgr;
191202
@Inject
@@ -231,6 +242,15 @@ public Status investigate(final long hostId) {
231242
return Status.Alert;
232243
}
233244

245+
if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
246+
String message = String.format("Unable to investigate the host %s (%d), VM high availability manager is disabled.", host.getName(), hostId);
247+
if (logger.isDebugEnabled()) {
248+
logger.debug(message);
249+
}
250+
sendHostAlert(host, message);
251+
return Status.Alert;
252+
}
253+
234254
Status hostState = null;
235255
for (Investigator investigator : investigators) {
236256
hostState = investigator.isAgentAlive(host);
@@ -260,6 +280,15 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
260280
return;
261281
}
262282

283+
if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
284+
String message = String.format("Unable to schedule restart for VMs on host %s (%d), VM high availability manager is disabled.", host.getName(), host.getId());
285+
if (logger.isDebugEnabled()) {
286+
logger.debug(message);
287+
}
288+
sendHostAlert(host, message);
289+
return;
290+
}
291+
263292
logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());
264293

265294
final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
@@ -314,12 +343,21 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
314343
}
315344

316345
@Override
317-
public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
346+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
318347
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);
319348

320349
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
321350
logger.info("There's already a job scheduled to stop " + vm);
322-
return;
351+
return false;
352+
}
353+
354+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
355+
String message = String.format("Unable to schedule stop for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
356+
if (logger.isDebugEnabled()) {
357+
logger.debug(message);
358+
}
359+
sendVMAlert(vm, message);
360+
return false;
323361
}
324362

325363
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
@@ -328,6 +366,7 @@ public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
328366
logger.debug("Scheduled " + work);
329367
}
330368
wakeupWorkers();
369+
return true;
331370
}
332371

333372
protected void wakeupWorkers() {
@@ -339,17 +378,37 @@ protected void wakeupWorkers() {
339378

340379
@Override
341380
public boolean scheduleMigration(final VMInstanceVO vm) {
342-
if (vm.getHostId() != null) {
343-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
344-
_haDao.persist(work);
345-
logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
346-
wakeupWorkers();
381+
if (vm.getHostId() == null) {
382+
return false;
383+
}
384+
385+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
386+
String message = String.format("Unable to schedule migration for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), vm.getHostId());
387+
if (logger.isDebugEnabled()) {
388+
logger.debug(message);
389+
}
390+
sendVMAlert(vm, message);
391+
return false;
347392
}
393+
394+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
395+
_haDao.persist(work);
396+
logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
397+
wakeupWorkers();
348398
return true;
349399
}
350400

351401
@Override
352402
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
403+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
404+
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
405+
if (logger.isDebugEnabled()) {
406+
logger.debug(message);
407+
}
408+
sendVMAlert(vm, message);
409+
return;
410+
}
411+
353412
logger.debug("HA schedule restart");
354413
Long hostId = vm.getHostId();
355414
if (hostId == null) {
@@ -440,7 +499,6 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
440499
}
441500

442501
wakeupWorkers();
443-
444502
}
445503

446504
private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
@@ -737,13 +795,23 @@ public Long migrate(final HaWorkVO work) {
737795
}
738796

739797
@Override
740-
public void scheduleDestroy(VMInstanceVO vm, long hostId) {
798+
/**
 * Schedules a Destroy HA work item for the given VM.
 *
 * When 'vm.ha.enabled' is false for the VM's zone, no work item is created: the reason is
 * logged at debug level, a VM alert is requested via sendVMAlert, and false is returned.
 *
 * @param vm the VM to destroy; its data center id selects the zone-scoped setting.
 * @param hostId id of the host recorded on the work item.
 * @return true if a work item was persisted and the workers were woken up, false if HA is disabled.
 */
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
799+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
800+
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
801+
if (logger.isDebugEnabled()) {
802+
logger.debug(message);
803+
}
804+
sendVMAlert(vm, message);
805+
return false;
806+
}
807+
741808
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
742809
_haDao.persist(work);
743810
if (logger.isDebugEnabled()) {
744811
logger.debug("Scheduled " + work.toString());
745812
}
746813
wakeupWorkers();
814+
return true;
747815
}
748816

749817
@Override
@@ -892,7 +960,17 @@ private long getRescheduleTime(WorkType workType) {
892960

893961
private void processWork(final HaWorkVO work) {
894962
final WorkType wt = work.getWorkType();
963+
final VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
895964
try {
965+
if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) {
966+
if (logger.isDebugEnabled()) {
967+
// The placeholders must match the three arguments (work item, VM name, VM id); a literal "(id)" would drop the id.
logger.debug(String.format("VM high availability manager is disabled, rescheduling the HA work %s, for the VM %s (%d) to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId()));
968+
}
969+
long nextTime = getRescheduleTime(wt);
970+
rescheduleWork(work, nextTime);
971+
return;
972+
}
973+
896974
Long nextTime = null;
897975
if (wt == WorkType.Migration) {
898976
nextTime = migrate(work);
@@ -921,9 +999,10 @@ private void processWork(final HaWorkVO work) {
921999

9221000
// if restart failed in the middle due to exception, VM state may has been changed
9231001
// recapture into the HA worker so that it can really continue in it next turn
924-
VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
925-
work.setUpdateTime(vm.getUpdated());
926-
work.setPreviousState(vm.getState());
1002+
if (vm != null) {
1003+
work.setUpdateTime(vm.getUpdated());
1004+
work.setPreviousState(vm.getState());
1005+
}
9271006
} finally {
9281007
if (!Step.Done.equals(work.getStep())) {
9291008
if (work.getTimesTried() >= _maxRetries) {
@@ -1128,11 +1207,33 @@ public String getConfigComponentName() {
11281207
public ConfigKey<?>[] getConfigKeys() {
11291208
return new ConfigKey[] {TimeBetweenCleanup, MigrationMaxRetries, TimeToSleep, TimeBetweenFailures,
11301209
StopRetryInterval, RestartRetryInterval, MigrateRetryInterval, InvestigateRetryInterval,
1131-
HAWorkers, ForceHA, KvmHAFenceHostIfHeartbeatFailsOnStorage};
1210+
HAWorkers, ForceHA, VmHaEnabled, VmHaAlertsEnabled, KvmHAFenceHostIfHeartbeatFailsOnStorage};
11321211
}
11331212

11341213
/**
 * Expunges HA work items for the given VMs by delegating to the HA work DAO.
 *
 * @param vmIds ids of the VMs whose work items are to be expunged.
 * @param batchSize passed through unchanged to the DAO.
 * @return the value returned by the DAO (presumably the number of expunged items; confirm against the DAO).
 */
@Override
11351214
public int expungeWorkItemsByVmList(List<Long> vmIds, Long batchSize) {
11361215
return _haDao.expungeByVmList(vmIds, batchSize);
11371216
}
1217+
1218+
/**
 * Sends an alert about a VM HA operation, if alerts are enabled for the VM's zone.
 *
 * Does nothing when vm is null or when 'vm.ha.alerts.enabled' is false for the VM's data center.
 * The alert type defaults to the user VM type and is switched for domain routers, console
 * proxies and secondary storage VMs.
 *
 * @param vm the VM the alert is about; may be null, in which case no alert is sent.
 * @param message text passed as both trailing text arguments of the alert manager call.
 */
private void sendVMAlert(VMInstanceVO vm, String message) {
1219+
if (vm == null || !VmHaAlertsEnabled.valueIn(vm.getDataCenterId())) {
1220+
return;
1221+
}
1222+
// Default to the user VM alert type; the system VM types below override it.
AlertManager.AlertType alertType = AlertManager.AlertType.ALERT_TYPE_USERVM;
1223+
if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
1224+
alertType = AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER;
1225+
} else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
1226+
alertType = AlertManager.AlertType.ALERT_TYPE_CONSOLE_PROXY;
1227+
} else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
1228+
alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
1229+
}
1230+
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
1231+
}
1232+
1233+
/**
 * Sends a host-type alert about a VM HA operation, if alerts are enabled for the host's zone.
 *
 * Does nothing when host is null or when 'vm.ha.alerts.enabled' is false for the host's data center.
 *
 * @param host the host the alert is about; may be null, in which case no alert is sent.
 * @param message text passed as both trailing text arguments of the alert manager call.
 */
private void sendHostAlert(HostVO host, String message) {
1234+
if (host == null || !VmHaAlertsEnabled.valueIn(host.getDataCenterId())) {
1235+
return;
1236+
}
1237+
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), message, message);
1238+
}
11381239
}

server/src/main/java/com/cloud/resource/ResourceManagerImpl.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import com.cloud.cpu.CPU;
4242
import com.cloud.exception.StorageConflictException;
4343
import com.cloud.exception.StorageUnavailableException;
44+
import com.cloud.ha.HighAvailabilityManagerImpl;
4445
import com.cloud.host.HostTagVO;
4546
import com.cloud.storage.Volume;
4647
import com.cloud.storage.VolumeVO;
@@ -1363,6 +1364,11 @@ private boolean doMaintain(final long hostId) {
13631364
throw new CloudRuntimeException("Cannot perform maintain when resource state is " + hostState + ", hostId = " + hostId);
13641365
}
13651366

1367+
final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);
1368+
if (CollectionUtils.isNotEmpty(vms) && !HighAvailabilityManagerImpl.VmHaEnabled.valueIn(host.getDataCenterId())) {
1369+
throw new CloudRuntimeException(String.format("Cannot perform maintain for the host %s (%d) as there are running VMs on it and VM high availability manager is disabled", host.getName(), hostId));
1370+
}
1371+
13661372
final MaintainAnswer answer = (MaintainAnswer)_agentMgr.easySend(hostId, new MaintainCommand());
13671373
if (answer == null || !answer.getResult()) {
13681374
logger.warn("Unable to send MaintainCommand to host: " + hostId);
@@ -1382,8 +1388,6 @@ private boolean doMaintain(final long hostId) {
13821388

13831389
/* TODO: move below to listener */
13841390
if (host.getType() == Host.Type.Routing) {
1385-
1386-
final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);
13871391
if (vms.size() == 0) {
13881392
return true;
13891393
}
@@ -2841,7 +2845,7 @@ public void deleteRoutingHost(final HostVO host, final boolean isForced, final b
28412845
logger.debug("Cannot transmit host " + host.getId() + " to Disabled state", e);
28422846
}
28432847
for (final VMInstanceVO vm : vms) {
2844-
if ((! HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
2848+
if ((!HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
28452849
logger.debug(String.format("Stopping %s as a part of hostDelete for %s",vm, host));
28462850
try {
28472851
_haMgr.scheduleStop(vm, host.getId(), WorkType.Stop);

0 commit comments

Comments
 (0)