Skip to content

Commit a3aa8ff

Browse files
committed
server: investigate pending HA work when executing in new MS session
For HA work items that are created for a host state change, checks must be done when execution is called in a new management server session. A new column, reason, has been added to the cloud.op_ha_work table to track the reason for the HA work. When HighAvailabilityManager starts, it finds all pending HA work items and puts them in the Investigating state. During execution of an HA work item, if it is found in the Investigating state, checks are done to verify whether the work is still valid. If the job is found to be invalid, it is cancelled. Signed-off-by: Abhishek Kumar <[email protected]>
1 parent bd488c4 commit a3aa8ff

File tree

11 files changed

+208
-44
lines changed

11 files changed

+208
-44
lines changed

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ public enum WorkType {
8484
HA; // Restart a VM.
8585
}
8686

87+
enum ReasonType {
88+
Unknown,
89+
HostMaintenance,
90+
HostDown,
91+
HostDegraded;
92+
}
93+
8794
enum Step {
8895
Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error,
8996
}
@@ -92,7 +99,7 @@ enum Step {
9299
* Investigate why a host has disconnected and migrate the VMs on it
93100
* if necessary.
94101
*
95-
* @param host - the host that has disconnected.
102+
* @param hostId - the id of the host that has disconnected.
96103
*/
97104
Status investigate(long hostId);
98105

@@ -109,17 +116,19 @@ enum Step {
109116
* @param investigate must be investigated before we do anything with this vm.
110117
*/
111118
void scheduleRestart(VMInstanceVO vm, boolean investigate);
119+
void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);
112120

113121
void cancelDestroy(VMInstanceVO vm, Long hostId);
114122

115-
boolean scheduleDestroy(VMInstanceVO vm, long hostId);
123+
boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);
116124

117125
/**
118126
* Schedule restarts for all vms running on the host.
119127
* @param host host.
120-
* @param investigate TODO
128+
* @param investigate whether to investigate
129+
* @param reasonType reason for HA work
121130
*/
122-
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate);
131+
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);
123132

124133
/**
125134
* Schedule the vm for migration.
@@ -128,6 +137,7 @@ enum Step {
128137
* @return true if schedule worked.
129138
*/
130139
boolean scheduleMigration(VMInstanceVO vm);
140+
boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);
131141

132142
List<VMInstanceVO> findTakenMigrationWork();
133143

@@ -140,10 +150,11 @@ enum Step {
140150
* 3. Check if a VM has been stopped: WorkType.CheckStop
141151
*
142152
* @param vm virtual machine to stop.
143-
* @param host host the virtual machine is on.
153+
* @param hostId the id of the host the virtual machine is on.
144154
* @param type which type of stop is requested.
145155
*/
146156
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
157+
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);
147158

148159
void cancelScheduledMigrations(HostVO host);
149160

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -993,7 +993,7 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
993993
handleDisconnectWithoutInvestigation(attache, event, true, true);
994994
host = _hostDao.findById(hostId); // Maybe the host magically reappeared?
995995
if (host != null && host.getStatus() == Status.Down) {
996-
_haMgr.scheduleRestartForVmsOnHost(host, true);
996+
_haMgr.scheduleRestartForVmsOnHost(host, true, HighAvailabilityManager.ReasonType.HostDown);
997997
}
998998
return true;
999999
}

engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes', 'last_id', 'bigint(20) uns
3535

3636
-- Add used_iops column to support IOPS data in storage stats
3737
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.storage_pool', 'used_iops', 'bigint unsigned DEFAULT NULL COMMENT "IOPS currently in use for this storage pool" ');
38+
39+
-- Add reason column for op_ha_work
40+
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason', 'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"');

server/src/main/java/com/cloud/ha/HaWorkVO.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ public class HaWorkVO implements InternalIdentity {
8686
@Column(name = "tried")
8787
int timesTried;
8888

89+
@Column(name = "reason")
90+
@Enumerated(value = EnumType.STRING)
91+
private HighAvailabilityManager.ReasonType reasonType;
92+
8993
protected HaWorkVO() {
9094
}
9195

@@ -179,7 +183,7 @@ public void setPreviousState(State state) {
179183
}
180184

181185
public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final WorkType workType, final Step step, final long hostId, final State previousState,
182-
final int timesTried, final long updated) {
186+
final int timesTried, final long updated, HighAvailabilityManager.ReasonType reasonType) {
183187
this.workType = workType;
184188
this.type = type;
185189
this.instanceId = instanceId;
@@ -191,6 +195,7 @@ public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final Wor
191195
this.step = step;
192196
this.timeToTry = System.currentTimeMillis() >> 10;
193197
this.updateTime = updated;
198+
this.reasonType = reasonType;
194199
}
195200

196201
@Override
@@ -207,4 +212,12 @@ public String toString() {
207212
.append("]")
208213
.toString();
209214
}
215+
216+
public HighAvailabilityManager.ReasonType getReasonType() {
217+
return reasonType;
218+
}
219+
220+
public void setReasonType(HighAvailabilityManager.ReasonType reasonType) {
221+
this.reasonType = reasonType;
222+
}
210223
}

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.apache.cloudstack.managed.context.ManagedContext;
4444
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
4545
import org.apache.cloudstack.management.ManagementServerHost;
46+
import org.apache.logging.log4j.ThreadContext;
4647

4748
import com.cloud.agent.AgentManager;
4849
import com.cloud.alert.AlertManager;
@@ -90,7 +91,6 @@
9091
import com.cloud.vm.VirtualMachineManager;
9192
import com.cloud.vm.VirtualMachineProfile;
9293
import com.cloud.vm.dao.VMInstanceDao;
93-
import org.apache.logging.log4j.ThreadContext;
9494

9595
/**
9696
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
@@ -133,6 +133,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
133133
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
134134
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);
135135

136+
protected static final List<ReasonType> CancellableWorkReasonTypes =
137+
List.of(ReasonType.HostMaintenance, ReasonType.HostDown, ReasonType.HostDegraded);
138+
136139
WorkerThread[] _workers;
137140
boolean _stopped;
138141
long _timeToSleep;
@@ -269,8 +272,7 @@ public Status investigate(final long hostId) {
269272
}
270273

271274
@Override
272-
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) {
273-
275+
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, ReasonType reasonType) {
274276
if (host.getType() != Host.Type.Routing) {
275277
return;
276278
}
@@ -337,12 +339,12 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
337339
logger.debug("VM {} is not on down host {} it is on other host {} VM HA is done", vm, host, hostId);
338340
continue;
339341
}
340-
scheduleRestart(vm, investigate);
342+
scheduleRestart(vm, investigate, reasonType);
341343
}
342344
}
343345

344346
@Override
345-
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
347+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType) {
346348
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);
347349

348350
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
@@ -359,7 +361,7 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
359361
return false;
360362
}
361363

362-
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
364+
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
363365
_haDao.persist(work);
364366
if (logger.isDebugEnabled()) {
365367
logger.debug("Scheduled " + work);
@@ -368,6 +370,11 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
368370
return true;
369371
}
370372

373+
@Override
374+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
375+
return scheduleStop(vm, hostId, type, null);
376+
}
377+
371378
protected void wakeupWorkers() {
372379
logger.debug("Wakeup workers HA");
373380
for (WorkerThread worker : _workers) {
@@ -376,7 +383,7 @@ protected void wakeupWorkers() {
376383
}
377384

378385
@Override
379-
public boolean scheduleMigration(final VMInstanceVO vm) {
386+
public boolean scheduleMigration(final VMInstanceVO vm, ReasonType reasonType) {
380387
if (vm.getHostId() == null) {
381388
return false;
382389
}
@@ -390,15 +397,20 @@ public boolean scheduleMigration(final VMInstanceVO vm) {
390397
return false;
391398
}
392399

393-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
400+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated(), reasonType);
394401
_haDao.persist(work);
395402
logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
396403
wakeupWorkers();
397404
return true;
398405
}
399406

400407
@Override
401-
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
408+
public boolean scheduleMigration(final VMInstanceVO vm) {
409+
return scheduleMigration(vm, null);
410+
}
411+
412+
@Override
413+
public void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType) {
402414
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
403415
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
404416
if (logger.isDebugEnabled()) {
@@ -490,7 +502,7 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
490502
}
491503

492504
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
493-
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
505+
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated(), reasonType);
494506
_haDao.persist(work);
495507

496508
if (logger.isInfoEnabled()) {
@@ -500,6 +512,11 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
500512
wakeupWorkers();
501513
}
502514

515+
@Override
516+
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
517+
scheduleRestart(vm, investigate, null);
518+
}
519+
503520
private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
504521
DeploymentPlanner planner) throws InsufficientCapacityException, ResourceUnavailableException,
505522
ConcurrentOperationException, OperationTimedoutException {
@@ -561,6 +578,9 @@ protected Long restart(final HaWorkVO work) {
561578
logger.info("Unable to find vm: " + vmId);
562579
return null;
563580
}
581+
if (checkAndCancelWorkIfNeeded(work, vm)) {
582+
return null;
583+
}
564584

565585
logger.info("HA on " + vm);
566586
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
@@ -762,6 +782,23 @@ protected Long restart(final HaWorkVO work) {
762782
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
763783
}
764784

785+
protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work, final VirtualMachine vm) {
786+
if (!Step.Investigating.equals(work.getStep())) {
787+
return false;
788+
}
789+
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
790+
return false;
791+
}
792+
793+
Status hostStatus = investigate(work.getHostId());
794+
if (!Status.Up.equals(hostStatus)) {
795+
return false;
796+
}
797+
logger.debug("Cancelling {} as it is not needed anymore", () -> work);
798+
work.setStep(Step.Cancelled);
799+
return true;
800+
}
801+
765802
public Long migrate(final HaWorkVO work) {
766803
long vmId = work.getInstanceId();
767804
long srcHostId = work.getHostId();
@@ -772,6 +809,9 @@ public Long migrate(final HaWorkVO work) {
772809
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
773810
return null;
774811
}
812+
if (checkAndCancelWorkIfNeeded(work, vm)) {
813+
return null;
814+
}
775815
logger.info("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
776816
try {
777817
work.setStep(Step.Migrating);
@@ -791,7 +831,7 @@ public Long migrate(final HaWorkVO work) {
791831
}
792832

793833
@Override
794-
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
834+
public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType) {
795835
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
796836
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
797837
if (logger.isDebugEnabled()) {
@@ -801,7 +841,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
801841
return false;
802842
}
803843

804-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
844+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
805845
_haDao.persist(work);
806846
if (logger.isDebugEnabled()) {
807847
logger.debug("Scheduled " + work.toString());
@@ -838,6 +878,9 @@ protected Long destroyVM(final HaWorkVO work) {
838878
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
839879
return null;
840880
}
881+
if (checkAndCancelWorkIfNeeded(work, vm)) {
882+
return null;
883+
}
841884
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
842885
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
843886
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
@@ -872,6 +915,9 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
872915
work.setStep(Step.Done);
873916
return null;
874917
}
918+
if (checkAndCancelWorkIfNeeded(work, vm)) {
919+
return null;
920+
}
875921
logger.info("Stopping " + vm);
876922
try {
877923
if (work.getWorkType() == WorkType.Stop) {
@@ -1057,6 +1103,8 @@ public boolean configure(final String name, final Map<String, Object> xmlParams)
10571103
public boolean start() {
10581104
_stopped = false;
10591105

1106+
_haDao.markPendingWorksAsInvestigating();
1107+
10601108
for (final WorkerThread thread : _workers) {
10611109
thread.start();
10621110
}

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,5 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
8686

8787
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
8888
int expungeByVmList(List<Long> vmIds, Long batchSize);
89+
void markPendingWorksAsInvestigating();
8990
}

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,13 @@
3131
import com.cloud.utils.db.SearchCriteria;
3232
import com.cloud.utils.db.SearchCriteria.Op;
3333
import com.cloud.utils.db.TransactionLegacy;
34+
import com.cloud.utils.db.UpdateBuilder;
3435
import com.cloud.utils.exception.CloudRuntimeException;
3536

3637
@Component
3738
public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> implements HighAvailabilityDao {
3839

39-
private final SearchBuilder<HaWorkVO> TBASearch;
40+
protected SearchBuilder<HaWorkVO> TBASearch;
4041
private final SearchBuilder<HaWorkVO> PreviousInstanceSearch;
4142
private final SearchBuilder<HaWorkVO> UntakenMigrationSearch;
4243
private final SearchBuilder<HaWorkVO> CleanupSearch;
@@ -270,4 +271,15 @@ public int expungeByVmList(List<Long> vmIds, Long batchSize) {
270271
sc.setParameters("vmIds", vmIds.toArray());
271272
return batchExpunge(sc, batchSize);
272273
}
274+
275+
@Override
276+
public void markPendingWorksAsInvestigating() {
277+
final SearchCriteria<HaWorkVO> sc = TBASearch.create();
278+
sc.setParameters("time", System.currentTimeMillis() >> 10);
279+
sc.setParameters("step", Step.Done, Step.Cancelled);
280+
HaWorkVO haWorkVO = createForUpdate();
281+
haWorkVO.setStep(Step.Investigating);
282+
UpdateBuilder updateBuilder = getUpdateBuilder(haWorkVO);
283+
update(updateBuilder, sc, null);
284+
}
273285
}

0 commit comments

Comments
 (0)