Commit 189e471

Handle scheduling failures more gracefully (#4486)
Currently, if we have a scheduling failure, we continuously run scheduling rounds until we succeed.
- Often it won't succeed, as the state of the system can't change much while no further reconciliations are run.

Now we continue to run reconciliation rounds between scheduling rounds, even if the scheduling round fails.
- We do this by setting `previousSchedulingRoundEnd` even if the scheduling round failed.

This allows the system to continue working even if we have a bug:
- It also allows operators to cancel the jobs causing the bug, or lets the state evolve so that future scheduling rounds may not exhibit the bug.

Signed-off-by: JamesMurkin <jamesmurkin@hotmail.com>
1 parent 5c16dad commit 189e471

1 file changed: +25 −24 lines

internal/scheduler/scheduler.go

Lines changed: 25 additions & 24 deletions
@@ -66,8 +66,6 @@ type Scheduler struct {
 	// Used to penalize short-running jobs by pretending they
 	// ran for some minimum length when calculating costs.
 	shortJobPenalty *scheduling.ShortJobPenalty
-	// The time the previous scheduling round ended
-	previousSchedulingRoundEnd time.Time
 	// Used for timing decisions (e.g., sleep).
 	// Injected here so that we can mock it out for testing.
 	clock clock.WithTicker
@@ -110,26 +108,25 @@ func NewScheduler(
 	marketDrivenPools []string,
 ) (*Scheduler, error) {
 	return &Scheduler{
-		jobRepository:              jobRepository,
-		executorRepository:         executorRepository,
-		schedulingAlgo:             schedulingAlgo,
-		leaderController:           leaderController,
-		publisher:                  publisher,
-		submitChecker:              submitChecker,
-		jobDb:                      jobDb,
-		clock:                      clock.RealClock{},
-		cyclePeriod:                cyclePeriod,
-		schedulePeriod:             schedulePeriod,
-		previousSchedulingRoundEnd: time.Time{},
-		executorTimeout:            executorTimeout,
-		bidPriceProvider:           bidPriceProvider,
-		shortJobPenalty:            shortJobPenalty,
-		maxAttemptedRuns:           maxAttemptedRuns,
-		nodeIdLabel:                nodeIdLabel,
-		jobsSerial:                 -1,
-		runsSerial:                 -1,
-		metrics:                    metrics,
-		marketDrivenPools:          marketDrivenPools,
+		jobRepository:      jobRepository,
+		executorRepository: executorRepository,
+		schedulingAlgo:     schedulingAlgo,
+		leaderController:   leaderController,
+		publisher:          publisher,
+		submitChecker:      submitChecker,
+		jobDb:              jobDb,
+		clock:              clock.RealClock{},
+		cyclePeriod:        cyclePeriod,
+		schedulePeriod:     schedulePeriod,
+		executorTimeout:    executorTimeout,
+		bidPriceProvider:   bidPriceProvider,
+		shortJobPenalty:    shortJobPenalty,
+		maxAttemptedRuns:   maxAttemptedRuns,
+		nodeIdLabel:        nodeIdLabel,
+		jobsSerial:         -1,
+		runsSerial:         -1,
+		metrics:            metrics,
+		marketDrivenPools:  marketDrivenPools,
 	}, nil
 }
 
@@ -152,6 +149,8 @@ func (s *Scheduler) Run(ctx *armadacontext.Context) error {
 
 	ticker := s.clock.NewTicker(s.cyclePeriod)
 	prevLeaderToken := leader.InvalidLeaderToken()
+
+	previousSchedulingRoundEnd := time.Time{}
 	cycleNumber := 0
 	for {
 		select {
@@ -188,12 +187,15 @@ func (s *Scheduler) Run(ctx *armadacontext.Context) error {
 			// and we must invalidate the held leader token to trigger flushing Pulsar at the next cycle.
 			//
 			// TODO: Once the Pulsar client supports transactions, we can guarantee consistency even in case of errors.
-			shouldSchedule := s.clock.Now().Sub(s.previousSchedulingRoundEnd) > s.schedulePeriod
+			shouldSchedule := s.clock.Now().Sub(previousSchedulingRoundEnd) > s.schedulePeriod
 			if !shouldSchedule {
 				ctx.Info("Won't schedule this cycle as still within schedulePeriod")
 			}
 
 			result, err := s.cycle(ctx, fullUpdate, leaderToken, shouldSchedule, cycleNumber)
+			if shouldSchedule {
+				previousSchedulingRoundEnd = s.clock.Now()
+			}
 			if err != nil {
 				ctx.Logger().WithStacktrace(err).Error("scheduling cycle failure")
 				leaderToken = leader.InvalidLeaderToken()
@@ -344,7 +346,6 @@ func (s *Scheduler) cycle(ctx *armadacontext.Context, updateAll bool, leaderToke
 			return overallSchedulerResult, err
 		}
 		events = append(events, resultEvents...)
-		s.previousSchedulingRoundEnd = s.clock.Now()
 
 		overallSchedulerResult = *result
 	}
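
For readers skimming the diff, here is a minimal, self-contained sketch of the loop pattern this change introduces. The names and values (runCycle, the two periods, the fixed iteration count) are illustrative stand-ins, not Armada's actual types or signatures; the point is only that previousSchedulingRoundEnd now lives in the run loop and is advanced whenever a scheduling round was attempted, even if it failed.

// Minimal sketch of the loop pattern from this commit (simplified; runCycle,
// the periods, and the loop bound are stand-ins, not Armada's real code).
package main

import (
	"errors"
	"fmt"
	"time"
)

func main() {
	const (
		cyclePeriod    = 10 * time.Millisecond // reconciliation cadence
		schedulePeriod = 50 * time.Millisecond // minimum gap between scheduling rounds
	)

	// Tracked as a local variable of the run loop rather than as struct state.
	previousSchedulingRoundEnd := time.Time{}

	ticker := time.NewTicker(cyclePeriod)
	defer ticker.Stop()

	for i := 0; i < 20; i++ {
		<-ticker.C

		shouldSchedule := time.Since(previousSchedulingRoundEnd) > schedulePeriod

		err := runCycle(shouldSchedule)

		// Key change: advance the timestamp whenever a scheduling round was
		// attempted, even if it failed, so the following cycles fall back to
		// reconciliation-only work instead of retrying scheduling every cycle.
		if shouldSchedule {
			previousSchedulingRoundEnd = time.Now()
		}

		if err != nil {
			fmt.Println("cycle failed:", err)
		}
	}
}

// runCycle stands in for Scheduler.cycle: it always reconciles, and only
// schedules when shouldSchedule is true. Here it always fails the scheduling
// round, to show that reconciliation-only cycles still run in between.
func runCycle(shouldSchedule bool) error {
	if shouldSchedule {
		return errors.New("scheduling round failed")
	}
	fmt.Println("reconciliation-only cycle")
	return nil
}

Moving the timestamp from the Scheduler struct into a local variable in Run also makes it explicit that only the run loop advances it, which is why the assignment in cycle could be deleted.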
