cadence-workflow · arzonus · Feb 19, 2026 · Feb 19, 2026 · gitar-bot · Feb 19, 2026
@@ -183,15 +183,16 @@ func (p *namespaceProcessor) runProcess(ctx context.Context) {
 
 // runRebalancingLoop handles shard assignment and redistribution.
 func (p *namespaceProcessor) runRebalancingLoop(ctx context.Context) {
+	// Buffered channel to allow one pending rebalance trigger.
+	triggerChan := make(chan string, 1)
 
 	// Perform an initial rebalance on startup.
 	err := p.rebalanceShards(ctx)
 	if err != nil {
 		p.logger.Error("initial rebalance failed", tag.Error(err))
 	}
 
-	updateChan, err := p.runRebalanceTriggeringLoop(ctx)
-	if err != nil {
+	if err := p.runRebalanceTriggeringLoop(ctx, triggerChan); err != nil {
 		p.logger.Error("failed to start rebalance triggering loop", tag.Error(err))
 		return
 	}
@@ -201,43 +202,46 @@ func (p *namespaceProcessor) runRebalancingLoop(ctx context.Context) {
 	for {
 		select {
 		case <-ctx.Done():
-			p.logger.Info("Rebalancing loop cancelled.")
+			p.logger.Info("Rebalancing loop cancelled")
 			return
 
-		case update := <-updateChan:
+		case triggerReason := <-triggerChan:
 			// If an update comes in before the cooldown has expired,
 			// we wait until the cooldown has passed since the last rebalance before processing it.
 			// This ensures that we don't rebalance too frequently in response to a flurry of updates
 			p.timeSource.Sleep(nextRebalanceAllowedAt.Sub(p.timeSource.Now()))
 			nextRebalanceAllowedAt = p.timeSource.Now().Add(p.cfg.RebalanceCooldown)
 
-			p.logger.Info("Rebalancing triggered", tag.Dynamic("reason", update))
+			p.logger.Info("Rebalancing triggered", tag.Dynamic("triggerReason", triggerReason))
 			if err := p.rebalanceShards(ctx); err != nil {
 				p.logger.Error("rebalance failed", tag.Error(err))
+
+				// If rebalance fails, we want to trigger another rebalance ASAP,
+				// but with a cooldown to avoid rebalance storms if the underlying issue is persistent.
+				select {
+				case triggerChan <- "Previous rebalance failed":
+				default:
+					// If the channel is full, we skip sending the update to avoid blocking the loop.
+				}
 			}
 		}
 	}
 }
 
 // runRebalanceTriggeringLoop monitors for state changes and periodic triggers to initiate rebalancing.
 // it doesn't block Subscribe calls to avoid a growing backlog of updates.
-func (p *namespaceProcessor) runRebalanceTriggeringLoop(ctx context.Context) (<-chan string, error) {
-	// Buffered channel to allow one pending rebalance trigger.
-	triggerChan := make(chan string, 1)
-
+func (p *namespaceProcessor) runRebalanceTriggeringLoop(ctx context.Context, triggerChan chan<- string) error {
 	updateChan, err := p.shardStore.SubscribeToExecutorStatusChanges(ctx, p.namespaceCfg.Name)
 	if err != nil {
 		p.logger.Error("Failed to subscribe to state changes, stopping rebalancing loop.", tag.Error(err))
-		return nil, err
+		return err
 	}
 
 	go p.rebalanceTriggeringLoop(ctx, updateChan, triggerChan)
-	return triggerChan, nil
+	return nil
 }
 
 func (p *namespaceProcessor) rebalanceTriggeringLoop(ctx context.Context, updateChan <-chan int64, triggerChan chan<- string) {
-	defer close(triggerChan)
-
 	ticker := p.timeSource.NewTicker(p.cfg.Period)
 	defer ticker.Stop()