Skip to content

Commit 34c3218

Browse files
authored
FR86 - server: investigate pending HA work when executing in new MS session (apache#519)
Fixes apache#511. For HA work items that are created for a host state change, checks must be done when execution is called in a new management server session. A new column, reason, has been added to the cloud.op_ha_work table to track the reason for the HA work. When HighAvailabilityManager starts, it finds all pending HA work items and puts them in the Investigating state. During execution of an HA work item, if it is found in the Investigating state, checks are done to verify whether the work is still valid. If the work is found to be invalid, it is cancelled. Upstream PR: apache#10167 Signed-off-by: Abhishek Kumar <[email protected]>
1 parent 62e374b commit 34c3218

File tree

11 files changed

+254
-35
lines changed

11 files changed

+254
-35
lines changed

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ public enum WorkType {
8181
HA; // Restart a VM.
8282
}
8383

84+
enum ReasonType {
85+
Unknown,
86+
HostMaintenance,
87+
HostDown,
88+
HostDegraded;
89+
}
90+
8491
enum Step {
8592
Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error,
8693
}
@@ -89,7 +96,7 @@ enum Step {
8996
* Investigate why a host has disconnected and migrate the VMs on it
9097
* if necessary.
9198
*
92-
* @param host - the host that has disconnected.
99+
* @param hostId - the id of the host that has disconnected.
93100
*/
94101
Status investigate(long hostId);
95102

@@ -106,17 +113,19 @@ enum Step {
106113
* @param investigate must be investigated before we do anything with this vm.
107114
*/
108115
void scheduleRestart(VMInstanceVO vm, boolean investigate);
116+
void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);
109117

110118
void cancelDestroy(VMInstanceVO vm, Long hostId);
111119

112-
boolean scheduleDestroy(VMInstanceVO vm, long hostId);
120+
boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);
113121

114122
/**
115123
* Schedule restarts for all vms running on the host.
116124
* @param host host.
117-
* @param investigate TODO
125+
* @param investigate whether to investigate
126+
* @param reasonType reason for HA work
118127
*/
119-
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate);
128+
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);
120129

121130
/**
122131
* Schedule the vm for migration.
@@ -125,6 +134,7 @@ enum Step {
125134
* @return true if schedule worked.
126135
*/
127136
boolean scheduleMigration(VMInstanceVO vm);
137+
boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);
128138

129139
List<VMInstanceVO> findTakenMigrationWork();
130140

@@ -137,10 +147,11 @@ enum Step {
137147
* 3. Check if a VM has been stopped: WorkType.CheckStop
138148
*
139149
* @param vm virtual machine to stop.
140-
* @param host host the virtual machine is on.
150+
* @param hostId the id of the host the virtual machine is on.
141151
* @param type which type of stop is requested.
142152
*/
143153
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
154+
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);
144155

145156
void cancelScheduledMigrations(HostVO host);
146157

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1034,7 +1034,7 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
10341034
handleDisconnectWithoutInvestigation(attache, event, true, true);
10351035
host = _hostDao.findById(hostId); // Maybe the host magically reappeared?
10361036
if (host != null && host.getStatus() == Status.Down) {
1037-
_haMgr.scheduleRestartForVmsOnHost(host, true);
1037+
_haMgr.scheduleRestartForVmsOnHost(host, true, HighAvailabilityManager.ReasonType.HostDown);
10381038
}
10391039
return true;
10401040
}

engine/schema/src/main/resources/META-INF/db/schema-41811to41812.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,3 +420,6 @@ CREATE VIEW `cloud`.`data_center_view` AS
420420
`cloud`.`dedicated_resources` ON data_center.id = dedicated_resources.data_center_id
421421
left join
422422
`cloud`.`affinity_group` ON dedicated_resources.affinity_group_id = affinity_group.id;
423+
424+
-- Add reason column for op_ha_work
425+
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason', 'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"');

server/src/main/java/com/cloud/ha/HaWorkVO.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ public class HaWorkVO implements InternalIdentity {
8686
@Column(name = "tried")
8787
int timesTried;
8888

89+
@Column(name = "reason")
90+
@Enumerated(value = EnumType.STRING)
91+
private HighAvailabilityManager.ReasonType reasonType;
92+
8993
protected HaWorkVO() {
9094
}
9195

@@ -179,7 +183,7 @@ public void setPreviousState(State state) {
179183
}
180184

181185
public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final WorkType workType, final Step step, final long hostId, final State previousState,
182-
final int timesTried, final long updated) {
186+
final int timesTried, final long updated, HighAvailabilityManager.ReasonType reasonType) {
183187
this.workType = workType;
184188
this.type = type;
185189
this.instanceId = instanceId;
@@ -191,6 +195,7 @@ public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final Wor
191195
this.step = step;
192196
this.timeToTry = System.currentTimeMillis() >> 10;
193197
this.updateTime = updated;
198+
this.reasonType = reasonType;
194199
}
195200

196201
@Override
@@ -207,4 +212,12 @@ public String toString() {
207212
.append("]")
208213
.toString();
209214
}
215+
216+
public HighAvailabilityManager.ReasonType getReasonType() {
217+
return reasonType;
218+
}
219+
220+
public void setReasonType(HighAvailabilityManager.ReasonType reasonType) {
221+
this.reasonType = reasonType;
222+
}
210223
}

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;
2020

2121
import java.util.ArrayList;
22+
import java.util.Arrays;
2223
import java.util.Date;
2324
import java.util.HashMap;
2425
import java.util.List;
@@ -124,6 +125,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
124125
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
125126
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);
126127

128+
protected static final List<ReasonType> CancellableWorkReasonTypes =
129+
Arrays.asList(ReasonType.HostMaintenance, ReasonType.HostDown, ReasonType.HostDegraded);
130+
127131
WorkerThread[] _workers;
128132
boolean _stopped;
129133
long _timeToSleep;
@@ -252,7 +256,7 @@ public Status investigate(final long hostId) {
252256
}
253257

254258
@Override
255-
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) {
259+
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, ReasonType reasonType) {
256260
if (host.getType() != Host.Type.Routing) {
257261
return;
258262
}
@@ -319,12 +323,12 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
319323
s_logger.debug(String.format("VM %s is not on down host %s it is on other host %d VM HA is done", vm, host, hostId));
320324
continue;
321325
}
322-
scheduleRestart(vm, investigate);
326+
scheduleRestart(vm, investigate, reasonType);
323327
}
324328
}
325329

326330
@Override
327-
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
331+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType) {
328332
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);
329333

330334
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
@@ -341,7 +345,7 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
341345
return false;
342346
}
343347

344-
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
348+
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
345349
_haDao.persist(work);
346350
if (s_logger.isDebugEnabled()) {
347351
s_logger.debug("Scheduled " + work);
@@ -350,14 +354,19 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
350354
return true;
351355
}
352356

357+
@Override
358+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
359+
return scheduleStop(vm, hostId, type, null);
360+
}
361+
353362
protected void wakeupWorkers() {
354363
for (WorkerThread worker : _workers) {
355364
worker.wakup();
356365
}
357366
}
358367

359368
@Override
360-
public boolean scheduleMigration(final VMInstanceVO vm) {
369+
public boolean scheduleMigration(final VMInstanceVO vm, ReasonType reasonType) {
361370
if (vm.getHostId() == null) {
362371
return false;
363372
}
@@ -371,15 +380,20 @@ public boolean scheduleMigration(final VMInstanceVO vm) {
371380
return false;
372381
}
373382

374-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
383+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated(), reasonType);
375384
_haDao.persist(work);
376385
s_logger.info(String.format("Scheduled migration work of VM %s from host %s with HAWork %s", vm, _hostDao.findById(vm.getHostId()), work));
377386
wakeupWorkers();
378387
return true;
379388
}
380389

381390
@Override
382-
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
391+
public boolean scheduleMigration(final VMInstanceVO vm) {
392+
return scheduleMigration(vm, null);
393+
}
394+
395+
@Override
396+
public void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType) {
383397
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
384398
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
385399
if (s_logger.isDebugEnabled()) {
@@ -470,7 +484,7 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
470484
}
471485

472486
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
473-
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
487+
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated(), reasonType);
474488
_haDao.persist(work);
475489

476490
if (s_logger.isInfoEnabled()) {
@@ -480,6 +494,11 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
480494
wakeupWorkers();
481495
}
482496

497+
@Override
498+
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
499+
scheduleRestart(vm, investigate, null);
500+
}
501+
483502
protected Long restart(final HaWorkVO work) {
484503
List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
485504
if (items.size() > 0) {
@@ -510,6 +529,9 @@ protected Long restart(final HaWorkVO work) {
510529
s_logger.info("Unable to find vm: " + vmId);
511530
return null;
512531
}
532+
if (checkAndCancelWorkIfNeeded(work)) {
533+
return null;
534+
}
513535

514536
s_logger.info("HA on " + vm);
515537
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
@@ -690,6 +712,24 @@ protected Long restart(final HaWorkVO work) {
690712
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
691713
}
692714

715+
protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) {
716+
if (!Step.Investigating.equals(work.getStep())) {
717+
return false;
718+
}
719+
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
720+
return false;
721+
}
722+
Status hostStatus = investigate(work.getHostId());
723+
if (!Status.Up.equals(hostStatus)) {
724+
return false;
725+
}
726+
if (s_logger.isDebugEnabled()) {
727+
s_logger.debug(String.format("Cancelling %s as it is not needed anymore", work));
728+
}
729+
work.setStep(Step.Cancelled);
730+
return true;
731+
}
732+
693733
public Long migrate(final HaWorkVO work) {
694734
long vmId = work.getInstanceId();
695735
long srcHostId = work.getHostId();
@@ -700,6 +740,9 @@ public Long migrate(final HaWorkVO work) {
700740
s_logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
701741
return null;
702742
}
743+
if (checkAndCancelWorkIfNeeded(work)) {
744+
return null;
745+
}
703746
s_logger.info(String.format("Migration attempt: for VM %s from host %s. Starting attempt: %d/%d times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries));
704747
try {
705748
work.setStep(Step.Migrating);
@@ -719,7 +762,7 @@ public Long migrate(final HaWorkVO work) {
719762
}
720763

721764
@Override
722-
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
765+
public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType) {
723766
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
724767
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
725768
if (s_logger.isDebugEnabled()) {
@@ -729,7 +772,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
729772
return false;
730773
}
731774

732-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
775+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
733776
_haDao.persist(work);
734777
if (s_logger.isDebugEnabled()) {
735778
s_logger.debug("Scheduled " + work.toString());
@@ -766,6 +809,9 @@ protected Long destroyVM(final HaWorkVO work) {
766809
s_logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
767810
return null;
768811
}
812+
if (checkAndCancelWorkIfNeeded(work)) {
813+
return null;
814+
}
769815
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
770816
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
771817
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
@@ -800,6 +846,9 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
800846
work.setStep(Step.Done);
801847
return null;
802848
}
849+
if (checkAndCancelWorkIfNeeded(work)) {
850+
return null;
851+
}
803852
s_logger.info("Stopping " + vm);
804853
try {
805854
if (work.getWorkType() == WorkType.Stop) {
@@ -987,6 +1036,8 @@ public boolean configure(final String name, final Map<String, Object> xmlParams)
9871036
public boolean start() {
9881037
_stopped = false;
9891038

1039+
_haDao.markPendingWorksAsInvestigating();
1040+
9901041
for (final WorkerThread thread : _workers) {
9911042
thread.start();
9921043
}
@@ -1004,6 +1055,8 @@ public boolean stop() {
10041055

10051056
_executor.shutdown();
10061057

1058+
_haDao.markServerPendingWorksAsInvestigating(_msServer.getId());
1059+
10071060
return true;
10081061
}
10091062

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
8686

8787
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
8888
int expungeByVmList(List<Long> vmIds, Long batchSize);
89+
void markPendingWorksAsInvestigating();
90+
void markServerPendingWorksAsInvestigating(long managementServerId);
8991
}

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,14 @@
3232
import com.cloud.utils.db.SearchCriteria;
3333
import com.cloud.utils.db.SearchCriteria.Op;
3434
import com.cloud.utils.db.TransactionLegacy;
35+
import com.cloud.utils.db.UpdateBuilder;
3536
import com.cloud.utils.exception.CloudRuntimeException;
3637

3738
@Component
3839
public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> implements HighAvailabilityDao {
3940
private static final Logger s_logger = Logger.getLogger(HighAvailabilityDaoImpl.class);
4041

41-
private final SearchBuilder<HaWorkVO> TBASearch;
42+
protected SearchBuilder<HaWorkVO> TBASearch;
4243
private final SearchBuilder<HaWorkVO> PreviousInstanceSearch;
4344
private final SearchBuilder<HaWorkVO> UntakenMigrationSearch;
4445
private final SearchBuilder<HaWorkVO> CleanupSearch;
@@ -272,4 +273,31 @@ public int expungeByVmList(List<Long> vmIds, Long batchSize) {
272273
sc.setParameters("vmIds", vmIds.toArray());
273274
return batchExpunge(sc, batchSize);
274275
}
276+
277+
protected void updatePendingWorkToInvestigating(SearchCriteria<HaWorkVO> sc) {
278+
HaWorkVO haWorkVO = createForUpdate();
279+
haWorkVO.setStep(Step.Investigating);
280+
UpdateBuilder updateBuilder = getUpdateBuilder(haWorkVO);
281+
update(updateBuilder, sc, null);
282+
}
283+
284+
@Override
285+
public void markPendingWorksAsInvestigating() {
286+
final SearchCriteria<HaWorkVO> sc = TBASearch.create();
287+
sc.setParameters("time", System.currentTimeMillis() >> 10);
288+
sc.setParameters("step", Step.Done, Step.Cancelled);
289+
updatePendingWorkToInvestigating(sc);
290+
}
291+
292+
@Override
293+
public void markServerPendingWorksAsInvestigating(long managementServerId) {
294+
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
295+
sb.and("server", sb.entity().getServerId(), Op.EQ);
296+
sb.and("step", sb.entity().getStep(), Op.NIN);
297+
sb.done();
298+
SearchCriteria<HaWorkVO> sc = sb.create();
299+
sc.setParameters("server", managementServerId);
300+
sc.setParameters("step", Step.Done, Step.Cancelled);
301+
updatePendingWorkToInvestigating(sc);
302+
}
275303
}

0 commit comments

Comments
 (0)