Skip to content

Commit 74124af

Browse files
committed
Merge branch '4.20' into improve-logging
2 parents 499df2d + 32af4a2 commit 74124af

File tree

12 files changed

+331
-39
lines changed

12 files changed

+331
-39
lines changed

api/src/main/java/org/apache/cloudstack/vm/UnmanagedInstanceTO.java

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,6 @@
1919

2020
import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
2121

22-
import static com.cloud.utils.NumbersUtil.toHumanReadableSize;
23-
2422
import java.util.List;
2523

2624
public class UnmanagedInstanceTO {

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
*/
3333
public interface HighAvailabilityManager extends Manager {
3434

35-
public ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
35+
ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
3636
"Force High-Availability to happen even if the VM says no.", true, Cluster);
3737

3838
ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",
@@ -112,7 +112,7 @@ enum Step {
112112

113113
void cancelDestroy(VMInstanceVO vm, Long hostId);
114114

115-
void scheduleDestroy(VMInstanceVO vm, long hostId);
115+
boolean scheduleDestroy(VMInstanceVO vm, long hostId);
116116

117117
/**
118118
* Schedule restarts for all vms running on the host.
@@ -143,7 +143,7 @@ enum Step {
143143
* @param host host the virtual machine is on.
144144
* @param type which type of stop is requested.
145145
*/
146-
void scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
146+
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
147147

148148
void cancelScheduledMigrations(HostVO host);
149149

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 115 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,8 @@
1616
// under the License.
1717
package com.cloud.ha;
1818

19+
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;
20+
1921
import java.util.ArrayList;
2022
import java.util.Date;
2123
import java.util.HashMap;
@@ -121,6 +123,16 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
121123
"Total number of attempts for trying migration of a VM.",
122124
true, ConfigKey.Scope.Global);
123125

126+
public static ConfigKey<Boolean> VmHaEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.enabled", "true",
127+
"Enable/Disable VM High Availability manager, it is enabled by default."
128+
+ " When enabled, the VM HA WorkItems (for VM Stop, Restart, Migration, Destroy) can be created and the scheduled items are executed; and"
129+
+ " When disabled, new VM HA WorkItems are not allowed and the scheduled items are retried until max retries configured at 'vm.ha.migration.max.retries'"
130+
+ " (executed in case HA is re-enabled during retry attempts), and then purged after 'time.between.failures' by the cleanup thread that runs"
131+
+ " regularly at 'time.between.cleanup'", true, Zone);
132+
133+
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
134+
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);
135+
124136
WorkerThread[] _workers;
125137
boolean _stopped;
126138
long _timeToSleep;
@@ -185,7 +197,6 @@ public void setHaPlanners(List<HAPlanner> haPlanners) {
185197
_haPlanners = haPlanners;
186198
}
187199

188-
189200
@Inject
190201
AgentManager _agentMgr;
191202
@Inject
@@ -231,6 +242,15 @@ public Status investigate(final long hostId) {
231242
return Status.Alert;
232243
}
233244

245+
if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
246+
String message = String.format("Unable to investigate the host %s (%d), VM high availability manager is disabled.", host.getName(), hostId);
247+
if (logger.isDebugEnabled()) {
248+
logger.debug(message);
249+
}
250+
sendHostAlert(host, message);
251+
return Status.Alert;
252+
}
253+
234254
Status hostState = null;
235255
for (Investigator investigator : investigators) {
236256
hostState = investigator.isAgentAlive(host);
@@ -260,6 +280,15 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
260280
return;
261281
}
262282

283+
if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
284+
String message = String.format("Unable to schedule restart for VMs on host %s, VM high availability manager is disabled.", host);
285+
if (logger.isDebugEnabled()) {
286+
logger.debug(message);
287+
}
288+
sendHostAlert(host, message);
289+
return;
290+
}
291+
263292
logger.warn("Scheduling restart for VMs on host {}", host);
264293

265294
final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
@@ -313,12 +342,21 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
313342
}
314343

315344
@Override
316-
public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
345+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
317346
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);
318347

319348
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
320349
logger.info("There's already a job scheduled to stop " + vm);
321-
return;
350+
return false;
351+
}
352+
353+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
354+
String message = String.format("Unable to schedule stop for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
355+
if (logger.isDebugEnabled()) {
356+
logger.debug(message);
357+
}
358+
sendVMAlert(vm, message);
359+
return false;
322360
}
323361

324362
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
@@ -327,6 +365,7 @@ public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
327365
logger.debug("Scheduled " + work);
328366
}
329367
wakeupWorkers();
368+
return true;
330369
}
331370

332371
protected void wakeupWorkers() {
@@ -338,17 +377,37 @@ protected void wakeupWorkers() {
338377

339378
@Override
340379
public boolean scheduleMigration(final VMInstanceVO vm) {
341-
if (vm.getHostId() != null) {
342-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
343-
_haDao.persist(work);
344-
logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
345-
wakeupWorkers();
380+
if (vm.getHostId() == null) {
381+
return false;
382+
}
383+
384+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
385+
String message = String.format("Unable to schedule migration for the VM %s on host %s, VM high availability manager is disabled.", vm, _hostDao.findById(vm.getHostId()));
386+
if (logger.isDebugEnabled()) {
387+
logger.debug(message);
388+
}
389+
sendVMAlert(vm, message);
390+
return false;
346391
}
392+
393+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
394+
_haDao.persist(work);
395+
logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
396+
wakeupWorkers();
347397
return true;
348398
}
349399

350400
@Override
351401
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
402+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
403+
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
404+
if (logger.isDebugEnabled()) {
405+
logger.debug(message);
406+
}
407+
sendVMAlert(vm, message);
408+
return;
409+
}
410+
352411
logger.debug("HA schedule restart");
353412
Long hostId = vm.getHostId();
354413
if (hostId == null) {
@@ -439,7 +498,6 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
439498
}
440499

441500
wakeupWorkers();
442-
443501
}
444502

445503
private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
@@ -733,13 +791,23 @@ public Long migrate(final HaWorkVO work) {
733791
}
734792

735793
@Override
736-
public void scheduleDestroy(VMInstanceVO vm, long hostId) {
794+
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
795+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
796+
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
797+
if (logger.isDebugEnabled()) {
798+
logger.debug(message);
799+
}
800+
sendVMAlert(vm, message);
801+
return false;
802+
}
803+
737804
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
738805
_haDao.persist(work);
739806
if (logger.isDebugEnabled()) {
740807
logger.debug("Scheduled " + work.toString());
741808
}
742809
wakeupWorkers();
810+
return true;
743811
}
744812

745813
@Override
@@ -890,7 +958,17 @@ private long getRescheduleTime(WorkType workType) {
890958

891959
private void processWork(final HaWorkVO work) {
892960
final WorkType wt = work.getWorkType();
961+
final VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
893962
try {
963+
if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) {
964+
if (logger.isDebugEnabled()) {
965+
logger.debug(String.format("VM high availability manager is disabled, rescheduling the HA work %s, for the VM %s (id) to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId()));
966+
}
967+
long nextTime = getRescheduleTime(wt);
968+
rescheduleWork(work, nextTime);
969+
return;
970+
}
971+
894972
Long nextTime = null;
895973
if (wt == WorkType.Migration) {
896974
nextTime = migrate(work);
@@ -919,9 +997,10 @@ private void processWork(final HaWorkVO work) {
919997

920998
// if restart failed in the middle due to exception, VM state may has been changed
921999
// recapture into the HA worker so that it can really continue in it next turn
922-
VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
923-
work.setUpdateTime(vm.getUpdated());
924-
work.setPreviousState(vm.getState());
1000+
if (vm != null) {
1001+
work.setUpdateTime(vm.getUpdated());
1002+
work.setPreviousState(vm.getState());
1003+
}
9251004
} finally {
9261005
if (!Step.Done.equals(work.getStep())) {
9271006
if (work.getTimesTried() >= _maxRetries) {
@@ -1126,11 +1205,33 @@ public String getConfigComponentName() {
11261205
public ConfigKey<?>[] getConfigKeys() {
11271206
return new ConfigKey[] {TimeBetweenCleanup, MigrationMaxRetries, TimeToSleep, TimeBetweenFailures,
11281207
StopRetryInterval, RestartRetryInterval, MigrateRetryInterval, InvestigateRetryInterval,
1129-
HAWorkers, ForceHA, KvmHAFenceHostIfHeartbeatFailsOnStorage};
1208+
HAWorkers, ForceHA, VmHaEnabled, VmHaAlertsEnabled, KvmHAFenceHostIfHeartbeatFailsOnStorage};
11301209
}
11311210

11321211
@Override
11331212
public int expungeWorkItemsByVmList(List<Long> vmIds, Long batchSize) {
11341213
return _haDao.expungeByVmList(vmIds, batchSize);
11351214
}
1215+
1216+
private void sendVMAlert(VMInstanceVO vm, String message) {
1217+
if (vm == null || !VmHaAlertsEnabled.valueIn(vm.getDataCenterId())) {
1218+
return;
1219+
}
1220+
AlertManager.AlertType alertType = AlertManager.AlertType.ALERT_TYPE_USERVM;
1221+
if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
1222+
alertType = AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER;
1223+
} else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
1224+
alertType = AlertManager.AlertType.ALERT_TYPE_CONSOLE_PROXY;
1225+
} else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
1226+
alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
1227+
}
1228+
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
1229+
}
1230+
1231+
private void sendHostAlert(HostVO host, String message) {
1232+
if (host == null || !VmHaAlertsEnabled.valueIn(host.getDataCenterId())) {
1233+
return;
1234+
}
1235+
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), message, message);
1236+
}
11361237
}

server/src/main/java/com/cloud/resource/ResourceManagerImpl.java

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
import com.cloud.cpu.CPU;
4242
import com.cloud.exception.StorageConflictException;
4343
import com.cloud.exception.StorageUnavailableException;
44+
import com.cloud.ha.HighAvailabilityManagerImpl;
4445
import com.cloud.host.HostTagVO;
4546
import com.cloud.storage.Volume;
4647
import com.cloud.storage.VolumeVO;
@@ -1362,6 +1363,11 @@ private boolean doMaintain(final long hostId) {
13621363
throw new CloudRuntimeException(String.format("Cannot perform maintain when resource state is %s, host = %s", hostState, host));
13631364
}
13641365

1366+
final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);
1367+
if (CollectionUtils.isNotEmpty(vms) && !HighAvailabilityManagerImpl.VmHaEnabled.valueIn(host.getDataCenterId())) {
1368+
throw new CloudRuntimeException(String.format("Cannot perform maintain for the host %s (%d) as there are running VMs on it and VM high availability manager is disabled", host.getName(), hostId));
1369+
}
1370+
13651371
final MaintainAnswer answer = (MaintainAnswer)_agentMgr.easySend(hostId, new MaintainCommand());
13661372
if (answer == null || !answer.getResult()) {
13671373
logger.warn("Unable to send MaintainCommand to host: {}", host);
@@ -1381,8 +1387,6 @@ private boolean doMaintain(final long hostId) {
13811387

13821388
/* TODO: move below to listener */
13831389
if (host.getType() == Host.Type.Routing) {
1384-
1385-
final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);
13861390
if (vms.size() == 0) {
13871391
return true;
13881392
}
@@ -2839,7 +2843,7 @@ public void deleteRoutingHost(final HostVO host, final boolean isForced, final b
28392843
logger.debug("Cannot transmit host {} to Disabled state", host, e);
28402844
}
28412845
for (final VMInstanceVO vm : vms) {
2842-
if ((! HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
2846+
if ((!HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
28432847
logger.debug(String.format("Stopping %s as a part of hostDelete for %s",vm, host));
28442848
try {
28452849
_haMgr.scheduleStop(vm, host.getId(), WorkType.Stop);

0 commit comments

Comments
 (0)