Skip to content

Commit be8c1fc

Browse files
fix(drain): cancel stale drains on spec revert
When a scale-down or rolling update is reversed before the drain completes, pods stay stuck with drain annotations indefinitely, causing the Shard to report Progressing even though all pods are within the desired state.

- Add clearDrainAnnotations helper in drain_helpers.go
- Add isDrainStale check in reconcileDrainState before running ExecuteDrainStateMachine — only cancels at the Requested state, where no standby-removal RPC has been sent yet
- Guard against cancelling drains on: Draining/Acknowledged/ReadyForDeletion states, deleting pods, DRAINED pods, extra pods beyond the replica count, and spec-drifted pods
- Add TestIsDrainStale with 7 cases covering all guard paths

Prevents pods from getting stuck in drain limbo when the desired state reverts, allowing the Shard to recover to Healthy.
1 parent 0d2f603 commit be8c1fc

File tree

3 files changed

+199
-9
lines changed

3 files changed

+199
-9
lines changed

pkg/resource-handler/controller/shard/drain_helpers.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,18 @@ func resolvePodRole(shard *multigresv1alpha1.Shard, podName string) string {
3131
return ""
3232
}
3333

34+
// clearDrainAnnotations removes all drain annotations from a pod via merge patch,
35+
// cancelling a drain that is no longer needed (e.g. scale-down reversed).
36+
func clearDrainAnnotations(ctx context.Context, k8sClient client.Client, pod *corev1.Pod) error {
37+
patch := client.MergeFrom(pod.DeepCopy())
38+
delete(pod.Annotations, metadata.AnnotationDrainState)
39+
delete(pod.Annotations, metadata.AnnotationDrainRequestedAt)
40+
if err := k8sClient.Patch(ctx, pod, patch); err != nil {
41+
return fmt.Errorf("failed to clear drain annotations for pod %s: %w", pod.Name, err)
42+
}
43+
return nil
44+
}
45+
3446
// initiateDrain sets the drain-requested annotation on a pod via merge patch,
3547
// starting the drain state machine: the reconciler removes the pod from the
3648
// sync standby list, unregisters it from etcd, then marks it ready-for-deletion.

pkg/resource-handler/controller/shard/reconcile_data_plane.go

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -182,22 +182,88 @@ func (r *ShardReconciler) reconcileDrainState(
182182
requeue := false
183183
for i := range podList.Items {
184184
pod := &podList.Items[i]
185-
if pod.Annotations[metadata.AnnotationDrainState] != "" {
186-
shouldRequeue, derr := drain.ExecuteDrainStateMachine(
187-
ctx, r.Client, r.RPCClient, r.Recorder, store, shard, pod,
188-
)
189-
if derr != nil {
190-
logger.Error(derr, "Failed to execute drain state machine", "pod", pod.Name)
191-
}
192-
if shouldRequeue {
193-
requeue = true
185+
state := pod.Annotations[metadata.AnnotationDrainState]
186+
if state == "" {
187+
continue
188+
}
189+
190+
if r.isDrainStale(shard, pod, state) {
191+
logger.Info("Cancelling stale drain: pod is within desired replicas and spec matches",
192+
"pod", pod.Name, "state", state)
193+
if err := clearDrainAnnotations(ctx, r.Client, pod); err != nil {
194+
logger.Error(err, "Failed to clear drain annotations", "pod", pod.Name)
194195
}
196+
r.Recorder.Eventf(shard, "Normal", "DrainCancelled",
197+
"Cancelled stale drain on pod %s (now within desired state)", pod.Name)
198+
continue
199+
}
200+
201+
shouldRequeue, derr := drain.ExecuteDrainStateMachine(
202+
ctx, r.Client, r.RPCClient, r.Recorder, store, shard, pod,
203+
)
204+
if derr != nil {
205+
logger.Error(derr, "Failed to execute drain state machine", "pod", pod.Name)
206+
}
207+
if shouldRequeue {
208+
requeue = true
195209
}
196210
}
197211

198212
return requeue, nil
199213
}
200214

215+
// isDrainStale returns true when a pod's drain is no longer needed because the
216+
// desired state has changed (e.g. scale-down reversed or rolling-update reverted).
217+
// Only early drain states (Requested/Draining) are cancellable — once Acknowledged,
218+
// etcd unregistration may have started and the drain must complete.
219+
func (r *ShardReconciler) isDrainStale(
220+
shard *multigresv1alpha1.Shard,
221+
pod *corev1.Pod,
222+
state string,
223+
) bool {
224+
// Only cancel Requested — nothing has happened yet at this point.
225+
// Draining means the standby removal RPC already succeeded and the pod
226+
// has been removed from the sync standby list; cancelling there would
227+
// leave an orphaned replica unless multiorch re-registers it.
228+
if state != metadata.DrainStateRequested {
229+
return false
230+
}
231+
232+
// Pods being deleted need the drain to cleanly unregister from etcd.
233+
if !pod.DeletionTimestamp.IsZero() {
234+
return false
235+
}
236+
237+
// DRAINED pods need replacement regardless of replica count or spec match.
238+
if resolvePodRole(shard, pod.Name) == "DRAINED" {
239+
return false
240+
}
241+
242+
poolName := pod.Labels[metadata.LabelMultigresPool]
243+
cellName := pod.Labels[metadata.LabelMultigresCell]
244+
if poolName == "" || cellName == "" {
245+
return false
246+
}
247+
248+
poolSpec, ok := shard.Spec.Pools[multigresv1alpha1.PoolName(poolName)]
249+
if !ok {
250+
return false
251+
}
252+
253+
replicas := DefaultPoolReplicas
254+
if poolSpec.ReplicasPerCell != nil {
255+
replicas = *poolSpec.ReplicasPerCell
256+
}
257+
258+
index := resolvePodIndex(pod.Name)
259+
if index < 0 || index >= int(replicas) {
260+
return false // Pod is still an extra pod for scale-down
261+
}
262+
263+
// Pod is within replica range — check if its spec still matches desired.
264+
return !podNeedsUpdate(pod, shard, poolName, cellName, poolSpec, index, r.Scheme)
265+
}
266+
201267
// getTopoStore returns a topology store, using the custom factory if set, otherwise the default.
202268
func (r *ShardReconciler) getTopoStore(shard *multigresv1alpha1.Shard) (topoclient.Store, error) {
203269
if r.CreateTopoStore != nil {

pkg/resource-handler/controller/shard/shard_controller_internal_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4411,6 +4411,118 @@ func TestUpdatePoolsStatus_TerminatingPodExcluded(t *testing.T) {
44114411
}
44124412
}
44134413

4414+
func TestIsDrainStale(t *testing.T) {
4415+
scheme := runtime.NewScheme()
4416+
_ = multigresv1alpha1.AddToScheme(scheme)
4417+
_ = corev1.AddToScheme(scheme)
4418+
4419+
shard := &multigresv1alpha1.Shard{
4420+
ObjectMeta: metav1.ObjectMeta{
4421+
Name: "test-shard-stale",
4422+
Namespace: "default",
4423+
Labels: map[string]string{
4424+
metadata.LabelMultigresCluster: "test-cluster",
4425+
},
4426+
},
4427+
Spec: multigresv1alpha1.ShardSpec{
4428+
DatabaseName: "db",
4429+
TableGroupName: "tg",
4430+
ShardName: "s1",
4431+
Pools: map[multigresv1alpha1.PoolName]multigresv1alpha1.PoolSpec{
4432+
"main": {
4433+
Cells: []multigresv1alpha1.CellName{"z1"},
4434+
ReplicasPerCell: ptr.To(int32(5)),
4435+
},
4436+
},
4437+
},
4438+
}
4439+
4440+
r := &ShardReconciler{Scheme: scheme}
4441+
4442+
// Build a pod with matching spec-hash for index 4
4443+
matchingPod := func(index int, drainState string) *corev1.Pod {
4444+
desired, err := BuildPoolPod(shard, "main", "z1", shard.Spec.Pools["main"], index, scheme)
4445+
if err != nil {
4446+
t.Fatalf("BuildPoolPod failed: %v", err)
4447+
}
4448+
hash := ComputeSpecHash(desired)
4449+
return &corev1.Pod{
4450+
ObjectMeta: metav1.ObjectMeta{
4451+
Name: BuildPoolPodName(shard, "main", "z1", index),
4452+
Namespace: "default",
4453+
Labels: buildPoolLabelsWithCell(shard, "main", "z1"),
4454+
Annotations: map[string]string{
4455+
metadata.AnnotationSpecHash: hash,
4456+
metadata.AnnotationDrainState: drainState,
4457+
metadata.AnnotationDrainRequestedAt: "2026-03-08T18:00:00Z",
4458+
},
4459+
},
4460+
}
4461+
}
4462+
4463+
t.Run("CancelsStaleScaleDownDrain", func(t *testing.T) {
4464+
pod := matchingPod(4, metadata.DrainStateRequested)
4465+
if !r.isDrainStale(shard, pod, metadata.DrainStateRequested) {
4466+
t.Error("expected drain to be stale (pod within replicas, spec matches)")
4467+
}
4468+
})
4469+
4470+
t.Run("DoesNotCancelDrainingState", func(t *testing.T) {
4471+
pod := matchingPod(4, metadata.DrainStateDraining)
4472+
if r.isDrainStale(shard, pod, metadata.DrainStateDraining) {
4473+
t.Error("expected drain NOT to be stale in Draining state (standby removal already sent)")
4474+
}
4475+
})
4476+
4477+
t.Run("DoesNotCancelExtraPodDrain", func(t *testing.T) {
4478+
// Reduce replicas so pod-4 (index 4) is extra
4479+
smallShard := shard.DeepCopy()
4480+
smallShard.Spec.Pools["main"] = multigresv1alpha1.PoolSpec{
4481+
Cells: []multigresv1alpha1.CellName{"z1"},
4482+
ReplicasPerCell: ptr.To(int32(4)),
4483+
}
4484+
pod := matchingPod(4, metadata.DrainStateRequested)
4485+
if r.isDrainStale(smallShard, pod, metadata.DrainStateRequested) {
4486+
t.Error("expected drain NOT to be stale (pod is extra)")
4487+
}
4488+
})
4489+
4490+
t.Run("DoesNotCancelAcknowledgedDrain", func(t *testing.T) {
4491+
pod := matchingPod(4, metadata.DrainStateAcknowledged)
4492+
if r.isDrainStale(shard, pod, metadata.DrainStateAcknowledged) {
4493+
t.Error("expected drain NOT to be stale (past point of no return)")
4494+
}
4495+
})
4496+
4497+
t.Run("DoesNotCancelDrainedPodDrain", func(t *testing.T) {
4498+
shardWithDrained := shard.DeepCopy()
4499+
pod := matchingPod(0, metadata.DrainStateRequested)
4500+
shardWithDrained.Status.PodRoles = map[string]string{
4501+
pod.Name: "DRAINED",
4502+
}
4503+
if r.isDrainStale(shardWithDrained, pod, metadata.DrainStateRequested) {
4504+
t.Error("expected drain NOT to be stale (pod role is DRAINED)")
4505+
}
4506+
})
4507+
4508+
t.Run("DoesNotCancelDrainOnDeletingPod", func(t *testing.T) {
4509+
pod := matchingPod(4, metadata.DrainStateRequested)
4510+
now := metav1.Now()
4511+
pod.DeletionTimestamp = &now
4512+
if r.isDrainStale(shard, pod, metadata.DrainStateRequested) {
4513+
t.Error("expected drain NOT to be stale (pod is being deleted)")
4514+
}
4515+
})
4516+
4517+
t.Run("DoesNotCancelWhenSpecDrifted", func(t *testing.T) {
4518+
pod := matchingPod(4, metadata.DrainStateRequested)
4519+
pod.Annotations[metadata.AnnotationSpecHash] = "wrong-hash"
4520+
if r.isDrainStale(shard, pod, metadata.DrainStateRequested) {
4521+
t.Error("expected drain NOT to be stale (spec-hash mismatch)")
4522+
}
4523+
})
4524+
}
4525+
44144526
func TestUpdatePoolsStatus_DrainAnnotationExcludedFromReady(t *testing.T) {
44154527
scheme := runtime.NewScheme()
44164528
_ = multigresv1alpha1.AddToScheme(scheme)

0 commit comments

Comments
 (0)