1919import static org .apache .cloudstack .framework .config .ConfigKey .Scope .Zone ;
2020
2121import java .util .ArrayList ;
22+ import java .util .Arrays ;
2223import java .util .Date ;
2324import java .util .HashMap ;
2425import java .util .List ;
4344import org .apache .cloudstack .managed .context .ManagedContext ;
4445import org .apache .cloudstack .managed .context .ManagedContextRunnable ;
4546import org .apache .cloudstack .management .ManagementServerHost ;
47+ import org .apache .logging .log4j .ThreadContext ;
4648
4749import com .cloud .agent .AgentManager ;
4850import com .cloud .alert .AlertManager ;
9092import com .cloud .vm .VirtualMachineManager ;
9193import com .cloud .vm .VirtualMachineProfile ;
9294import com .cloud .vm .dao .VMInstanceDao ;
93- import org .apache .logging .log4j .ThreadContext ;
9495
9596/**
9697 * HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
@@ -133,6 +134,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
133134 protected static ConfigKey <Boolean > VmHaAlertsEnabled = new ConfigKey <>("Advanced" , Boolean .class , "vm.ha.alerts.enabled" , "true" ,
134135 "Enable/Disable alerts for the VM HA operations, it is enabled by default." , true , Zone );
135136
137+ protected static final List <ReasonType > CancellableWorkReasonTypes =
138+ Arrays .asList (ReasonType .HostMaintenance , ReasonType .HostDown , ReasonType .HostDegraded );
139+
136140 WorkerThread [] _workers ;
137141 boolean _stopped ;
138142 long _timeToSleep ;
@@ -269,8 +273,7 @@ public Status investigate(final long hostId) {
269273 }
270274
271275 @ Override
272- public void scheduleRestartForVmsOnHost (final HostVO host , boolean investigate ) {
273-
276+ public void scheduleRestartForVmsOnHost (final HostVO host , boolean investigate , ReasonType reasonType ) {
274277 if (host .getType () != Host .Type .Routing ) {
275278 return ;
276279 }
@@ -337,12 +340,12 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
337340 logger .debug ("VM {} is not on down host {} it is on other host {} VM HA is done" , vm , host , hostId );
338341 continue ;
339342 }
340- scheduleRestart (vm , investigate );
343+ scheduleRestart (vm , investigate , reasonType );
341344 }
342345 }
343346
344347 @ Override
345- public boolean scheduleStop (VMInstanceVO vm , long hostId , WorkType type ) {
348+ public boolean scheduleStop (VMInstanceVO vm , long hostId , WorkType type , ReasonType reasonType ) {
346349 assert (type == WorkType .CheckStop || type == WorkType .ForceStop || type == WorkType .Stop );
347350
348351 if (_haDao .hasBeenScheduled (vm .getId (), type )) {
@@ -359,7 +362,7 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
359362 return false ;
360363 }
361364
362- HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), type , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated ());
365+ HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), type , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated (), reasonType );
363366 _haDao .persist (work );
364367 if (logger .isDebugEnabled ()) {
365368 logger .debug ("Scheduled " + work );
@@ -368,6 +371,11 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
368371 return true ;
369372 }
370373
374+ @ Override
375+ public boolean scheduleStop (VMInstanceVO vm , long hostId , WorkType type ) {
376+ return scheduleStop (vm , hostId , type , null );
377+ }
378+
371379 protected void wakeupWorkers () {
372380 logger .debug ("Wakeup workers HA" );
373381 for (WorkerThread worker : _workers ) {
@@ -376,7 +384,7 @@ protected void wakeupWorkers() {
376384 }
377385
378386 @ Override
379- public boolean scheduleMigration (final VMInstanceVO vm ) {
387+ public boolean scheduleMigration (final VMInstanceVO vm , ReasonType reasonType ) {
380388 if (vm .getHostId () == null ) {
381389 return false ;
382390 }
@@ -390,15 +398,20 @@ public boolean scheduleMigration(final VMInstanceVO vm) {
390398 return false ;
391399 }
392400
393- final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Migration , Step .Scheduled , vm .getHostId (), vm .getState (), 0 , vm .getUpdated ());
401+ final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Migration , Step .Scheduled , vm .getHostId (), vm .getState (), 0 , vm .getUpdated (), reasonType );
394402 _haDao .persist (work );
395403 logger .info ("Scheduled migration work of VM {} from host {} with HAWork {}" , vm , _hostDao .findById (vm .getHostId ()), work );
396404 wakeupWorkers ();
397405 return true ;
398406 }
399407
400408 @ Override
401- public void scheduleRestart (VMInstanceVO vm , boolean investigate ) {
409+ public boolean scheduleMigration (final VMInstanceVO vm ) {
410+ return scheduleMigration (vm , null );
411+ }
412+
413+ @ Override
414+ public void scheduleRestart (VMInstanceVO vm , boolean investigate , ReasonType reasonType ) {
402415 if (!VmHaEnabled .valueIn (vm .getDataCenterId ())) {
403416 String message = String .format ("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled." , vm .getName (), vm .getId ());
404417 if (logger .isDebugEnabled ()) {
@@ -490,7 +503,7 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
490503 }
491504
492505 HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .HA , investigate ? Step .Investigating : Step .Scheduled ,
493- hostId != null ? hostId : 0L , vm .getState (), timesTried , vm .getUpdated ());
506+ hostId != null ? hostId : 0L , vm .getState (), timesTried , vm .getUpdated (), reasonType );
494507 _haDao .persist (work );
495508
496509 if (logger .isInfoEnabled ()) {
@@ -500,6 +513,11 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
500513 wakeupWorkers ();
501514 }
502515
516+ @ Override
517+ public void scheduleRestart (VMInstanceVO vm , boolean investigate ) {
518+ scheduleRestart (vm , investigate , null );
519+ }
520+
503521 private void startVm (VirtualMachine vm , Map <VirtualMachineProfile .Param , Object > params ,
504522 DeploymentPlanner planner ) throws InsufficientCapacityException , ResourceUnavailableException ,
505523 ConcurrentOperationException , OperationTimedoutException {
@@ -561,6 +579,9 @@ protected Long restart(final HaWorkVO work) {
561579 logger .info ("Unable to find vm: " + vmId );
562580 return null ;
563581 }
582+ if (checkAndCancelWorkIfNeeded (work )) {
583+ return null ;
584+ }
564585
565586 logger .info ("HA on " + vm );
566587 if (vm .getState () != work .getPreviousState () || vm .getUpdated () != work .getUpdateTime ()) {
@@ -762,6 +783,22 @@ protected Long restart(final HaWorkVO work) {
762783 return (System .currentTimeMillis () >> 10 ) + _restartRetryInterval ;
763784 }
764785
786+ protected boolean checkAndCancelWorkIfNeeded (final HaWorkVO work ) {
787+ if (!Step .Investigating .equals (work .getStep ())) {
788+ return false ;
789+ }
790+ if (!CancellableWorkReasonTypes .contains (work .getReasonType ())) {
791+ return false ;
792+ }
793+ Status hostStatus = investigate (work .getHostId ());
794+ if (!Status .Up .equals (hostStatus )) {
795+ return false ;
796+ }
797+ logger .debug ("Cancelling {} as it is not needed anymore" , () -> work );
798+ work .setStep (Step .Cancelled );
799+ return true ;
800+ }
801+
765802 public Long migrate (final HaWorkVO work ) {
766803 long vmId = work .getInstanceId ();
767804 long srcHostId = work .getHostId ();
@@ -772,6 +809,9 @@ public Long migrate(final HaWorkVO work) {
772809 logger .info ("Unable to find vm: " + vmId + ", skipping migrate." );
773810 return null ;
774811 }
812+ if (checkAndCancelWorkIfNeeded (work )) {
813+ return null ;
814+ }
775815 logger .info ("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times." , vm , srcHost , 1 + work .getTimesTried (), _maxRetries );
776816 try {
777817 work .setStep (Step .Migrating );
@@ -791,7 +831,7 @@ public Long migrate(final HaWorkVO work) {
791831 }
792832
793833 @ Override
794- public boolean scheduleDestroy (VMInstanceVO vm , long hostId ) {
834+ public boolean scheduleDestroy (VMInstanceVO vm , long hostId , ReasonType reasonType ) {
795835 if (!VmHaEnabled .valueIn (vm .getDataCenterId ())) {
796836 String message = String .format ("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled." , vm .getName (), vm .getId (), hostId );
797837 if (logger .isDebugEnabled ()) {
@@ -801,7 +841,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
801841 return false ;
802842 }
803843
804- final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Destroy , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated ());
844+ final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Destroy , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated (), reasonType );
805845 _haDao .persist (work );
806846 if (logger .isDebugEnabled ()) {
807847 logger .debug ("Scheduled " + work .toString ());
@@ -838,6 +878,9 @@ protected Long destroyVM(final HaWorkVO work) {
838878 logger .info ("No longer can find VM " + work .getInstanceId () + ". Throwing away " + work );
839879 return null ;
840880 }
881+ if (checkAndCancelWorkIfNeeded (work )) {
882+ return null ;
883+ }
841884 boolean expunge = VirtualMachine .Type .SecondaryStorageVm .equals (vm .getType ())
842885 || VirtualMachine .Type .ConsoleProxy .equals (vm .getType ());
843886 if (!expunge && VirtualMachine .State .Destroyed .equals (work .getPreviousState ())) {
@@ -872,6 +915,9 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
872915 work .setStep (Step .Done );
873916 return null ;
874917 }
918+ if (checkAndCancelWorkIfNeeded (work )) {
919+ return null ;
920+ }
875921 logger .info ("Stopping " + vm );
876922 try {
877923 if (work .getWorkType () == WorkType .Stop ) {
@@ -1057,6 +1103,8 @@ public boolean configure(final String name, final Map<String, Object> xmlParams)
10571103 public boolean start () {
10581104 _stopped = false ;
10591105
1106+ _haDao .markPendingWorksAsInvestigating ();
1107+
10601108 for (final WorkerThread thread : _workers ) {
10611109 thread .start ();
10621110 }
@@ -1074,6 +1122,8 @@ public boolean stop() {
10741122
10751123 _executor .shutdown ();
10761124
1125+ _haDao .markServerPendingWorksAsInvestigating (_msServer .getId ());
1126+
10771127 return true ;
10781128 }
10791129
0 commit comments