Skip to content

Commit 33a37da

Browse files
authored
server: investigate pending HA work when executing in new MS session (#10167)
For HA work items that are created for a host state change, checks must be done when execution is called in a new management server session. A new column, reason, has been added to the cloud.op_ha_work table to track the reason for the HA work. When HighAvailabilityManager starts, it finds all pending HA work items and puts them in the Investigating state. During execution of an HA work item, if it is found in the Investigating state, checks are done to verify whether the work is still valid. If the job is found to be invalid, it is cancelled. Signed-off-by: Abhishek Kumar <[email protected]>
1 parent 34d2a3b commit 33a37da

File tree

11 files changed

+260
-44
lines changed

11 files changed

+260
-44
lines changed

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ public enum WorkType {
8484
HA; // Restart a VM.
8585
}
8686

87+
enum ReasonType {
88+
Unknown,
89+
HostMaintenance,
90+
HostDown,
91+
HostDegraded;
92+
}
93+
8794
enum Step {
8895
Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error,
8996
}
@@ -92,7 +99,7 @@ enum Step {
9299
* Investigate why a host has disconnected and migrate the VMs on it
93100
* if necessary.
94101
*
95-
* @param host - the host that has disconnected.
102+
* @param hostId - the id of the host that has disconnected.
96103
*/
97104
Status investigate(long hostId);
98105

@@ -109,17 +116,19 @@ enum Step {
109116
* @param investigate must be investigated before we do anything with this vm.
110117
*/
111118
void scheduleRestart(VMInstanceVO vm, boolean investigate);
119+
void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);
112120

113121
void cancelDestroy(VMInstanceVO vm, Long hostId);
114122

115-
boolean scheduleDestroy(VMInstanceVO vm, long hostId);
123+
boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);
116124

117125
/**
118126
* Schedule restarts for all vms running on the host.
119127
* @param host the host whose running VMs are to be scheduled for restart.
120-
* @param investigate TODO
128+
* @param investigate whether to investigate
129+
* @param reasonType reason for HA work
121130
*/
122-
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate);
131+
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);
123132

124133
/**
125134
* Schedule the vm for migration.
@@ -128,6 +137,7 @@ enum Step {
128137
* @return true if schedule worked.
129138
*/
130139
boolean scheduleMigration(VMInstanceVO vm);
140+
boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);
131141

132142
List<VMInstanceVO> findTakenMigrationWork();
133143

@@ -140,10 +150,11 @@ enum Step {
140150
* 3. Check if a VM has been stopped: WorkType.CheckStop
141151
*
142152
* @param vm virtual machine to stop.
143-
* @param host host the virtual machine is on.
153+
* @param hostId the id of the host the virtual machine is on.
144154
* @param type which type of stop is requested.
145155
*/
146156
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
157+
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);
147158

148159
void cancelScheduledMigrations(HostVO host);
149160

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -989,7 +989,7 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
989989
handleDisconnectWithoutInvestigation(attache, event, true, true);
990990
host = _hostDao.findById(hostId); // Maybe the host magically reappeared?
991991
if (host != null && host.getStatus() == Status.Down) {
992-
_haMgr.scheduleRestartForVmsOnHost(host, true);
992+
_haMgr.scheduleRestartForVmsOnHost(host, true, HighAvailabilityManager.ReasonType.HostDown);
993993
}
994994
return true;
995995
}

engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes', 'last_id', 'bigint(20) uns
3535

3636
-- Add used_iops column to support IOPS data in storage stats
3737
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.storage_pool', 'used_iops', 'bigint unsigned DEFAULT NULL COMMENT "IOPS currently in use for this storage pool" ');
38+
39+
-- Add reason column for op_ha_work
40+
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason', 'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"');

server/src/main/java/com/cloud/ha/HaWorkVO.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ public class HaWorkVO implements InternalIdentity {
8686
@Column(name = "tried")
8787
int timesTried;
8888

89+
@Column(name = "reason")
90+
@Enumerated(value = EnumType.STRING)
91+
private HighAvailabilityManager.ReasonType reasonType;
92+
8993
protected HaWorkVO() {
9094
}
9195

@@ -179,7 +183,7 @@ public void setPreviousState(State state) {
179183
}
180184

181185
public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final WorkType workType, final Step step, final long hostId, final State previousState,
182-
final int timesTried, final long updated) {
186+
final int timesTried, final long updated, HighAvailabilityManager.ReasonType reasonType) {
183187
this.workType = workType;
184188
this.type = type;
185189
this.instanceId = instanceId;
@@ -191,6 +195,7 @@ public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final Wor
191195
this.step = step;
192196
this.timeToTry = System.currentTimeMillis() >> 10;
193197
this.updateTime = updated;
198+
this.reasonType = reasonType;
194199
}
195200

196201
@Override
@@ -207,4 +212,12 @@ public String toString() {
207212
.append("]")
208213
.toString();
209214
}
215+
216+
public HighAvailabilityManager.ReasonType getReasonType() {
217+
return reasonType;
218+
}
219+
220+
public void setReasonType(HighAvailabilityManager.ReasonType reasonType) {
221+
this.reasonType = reasonType;
222+
}
210223
}

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;
2020

2121
import java.util.ArrayList;
22+
import java.util.Arrays;
2223
import java.util.Date;
2324
import java.util.HashMap;
2425
import java.util.List;
@@ -43,6 +44,7 @@
4344
import org.apache.cloudstack.managed.context.ManagedContext;
4445
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
4546
import org.apache.cloudstack.management.ManagementServerHost;
47+
import org.apache.logging.log4j.ThreadContext;
4648

4749
import com.cloud.agent.AgentManager;
4850
import com.cloud.alert.AlertManager;
@@ -90,7 +92,6 @@
9092
import com.cloud.vm.VirtualMachineManager;
9193
import com.cloud.vm.VirtualMachineProfile;
9294
import com.cloud.vm.dao.VMInstanceDao;
93-
import org.apache.logging.log4j.ThreadContext;
9495

9596
/**
9697
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
@@ -133,6 +134,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
133134
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
134135
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);
135136

137+
protected static final List<ReasonType> CancellableWorkReasonTypes =
138+
Arrays.asList(ReasonType.HostMaintenance, ReasonType.HostDown, ReasonType.HostDegraded);
139+
136140
WorkerThread[] _workers;
137141
boolean _stopped;
138142
long _timeToSleep;
@@ -269,8 +273,7 @@ public Status investigate(final long hostId) {
269273
}
270274

271275
@Override
272-
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) {
273-
276+
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, ReasonType reasonType) {
274277
if (host.getType() != Host.Type.Routing) {
275278
return;
276279
}
@@ -337,12 +340,12 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
337340
logger.debug("VM {} is not on down host {} it is on other host {} VM HA is done", vm, host, hostId);
338341
continue;
339342
}
340-
scheduleRestart(vm, investigate);
343+
scheduleRestart(vm, investigate, reasonType);
341344
}
342345
}
343346

344347
@Override
345-
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
348+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType) {
346349
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);
347350

348351
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
@@ -359,7 +362,7 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
359362
return false;
360363
}
361364

362-
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
365+
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
363366
_haDao.persist(work);
364367
if (logger.isDebugEnabled()) {
365368
logger.debug("Scheduled " + work);
@@ -368,6 +371,11 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
368371
return true;
369372
}
370373

374+
@Override
375+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
376+
return scheduleStop(vm, hostId, type, null);
377+
}
378+
371379
protected void wakeupWorkers() {
372380
logger.debug("Wakeup workers HA");
373381
for (WorkerThread worker : _workers) {
@@ -376,7 +384,7 @@ protected void wakeupWorkers() {
376384
}
377385

378386
@Override
379-
public boolean scheduleMigration(final VMInstanceVO vm) {
387+
public boolean scheduleMigration(final VMInstanceVO vm, ReasonType reasonType) {
380388
if (vm.getHostId() == null) {
381389
return false;
382390
}
@@ -390,15 +398,20 @@ public boolean scheduleMigration(final VMInstanceVO vm) {
390398
return false;
391399
}
392400

393-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
401+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated(), reasonType);
394402
_haDao.persist(work);
395403
logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
396404
wakeupWorkers();
397405
return true;
398406
}
399407

400408
@Override
401-
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
409+
public boolean scheduleMigration(final VMInstanceVO vm) {
410+
return scheduleMigration(vm, null);
411+
}
412+
413+
@Override
414+
public void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType) {
402415
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
403416
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
404417
if (logger.isDebugEnabled()) {
@@ -490,7 +503,7 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
490503
}
491504

492505
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
493-
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
506+
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated(), reasonType);
494507
_haDao.persist(work);
495508

496509
if (logger.isInfoEnabled()) {
@@ -500,6 +513,11 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
500513
wakeupWorkers();
501514
}
502515

516+
@Override
517+
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
518+
scheduleRestart(vm, investigate, null);
519+
}
520+
503521
private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
504522
DeploymentPlanner planner) throws InsufficientCapacityException, ResourceUnavailableException,
505523
ConcurrentOperationException, OperationTimedoutException {
@@ -561,6 +579,9 @@ protected Long restart(final HaWorkVO work) {
561579
logger.info("Unable to find vm: " + vmId);
562580
return null;
563581
}
582+
if (checkAndCancelWorkIfNeeded(work)) {
583+
return null;
584+
}
564585

565586
logger.info("HA on " + vm);
566587
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
@@ -762,6 +783,22 @@ protected Long restart(final HaWorkVO work) {
762783
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
763784
}
764785

786+
protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) {
787+
if (!Step.Investigating.equals(work.getStep())) {
788+
return false;
789+
}
790+
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
791+
return false;
792+
}
793+
Status hostStatus = investigate(work.getHostId());
794+
if (!Status.Up.equals(hostStatus)) {
795+
return false;
796+
}
797+
logger.debug("Cancelling {} as it is not needed anymore", () -> work);
798+
work.setStep(Step.Cancelled);
799+
return true;
800+
}
801+
765802
public Long migrate(final HaWorkVO work) {
766803
long vmId = work.getInstanceId();
767804
long srcHostId = work.getHostId();
@@ -772,6 +809,9 @@ public Long migrate(final HaWorkVO work) {
772809
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
773810
return null;
774811
}
812+
if (checkAndCancelWorkIfNeeded(work)) {
813+
return null;
814+
}
775815
logger.info("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
776816
try {
777817
work.setStep(Step.Migrating);
@@ -791,7 +831,7 @@ public Long migrate(final HaWorkVO work) {
791831
}
792832

793833
@Override
794-
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
834+
public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType) {
795835
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
796836
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
797837
if (logger.isDebugEnabled()) {
@@ -801,7 +841,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
801841
return false;
802842
}
803843

804-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
844+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
805845
_haDao.persist(work);
806846
if (logger.isDebugEnabled()) {
807847
logger.debug("Scheduled " + work.toString());
@@ -838,6 +878,9 @@ protected Long destroyVM(final HaWorkVO work) {
838878
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
839879
return null;
840880
}
881+
if (checkAndCancelWorkIfNeeded(work)) {
882+
return null;
883+
}
841884
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
842885
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
843886
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
@@ -872,6 +915,9 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
872915
work.setStep(Step.Done);
873916
return null;
874917
}
918+
if (checkAndCancelWorkIfNeeded(work)) {
919+
return null;
920+
}
875921
logger.info("Stopping " + vm);
876922
try {
877923
if (work.getWorkType() == WorkType.Stop) {
@@ -1057,6 +1103,8 @@ public boolean configure(final String name, final Map<String, Object> xmlParams)
10571103
public boolean start() {
10581104
_stopped = false;
10591105

1106+
_haDao.markPendingWorksAsInvestigating();
1107+
10601108
for (final WorkerThread thread : _workers) {
10611109
thread.start();
10621110
}
@@ -1074,6 +1122,8 @@ public boolean stop() {
10741122

10751123
_executor.shutdown();
10761124

1125+
_haDao.markServerPendingWorksAsInvestigating(_msServer.getId());
1126+
10771127
return true;
10781128
}
10791129

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
8686

8787
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
8888
int expungeByVmList(List<Long> vmIds, Long batchSize);
89+
void markPendingWorksAsInvestigating();
90+
void markServerPendingWorksAsInvestigating(long managementServerId);
8991
}

0 commit comments

Comments
 (0)