Skip to content

Commit 829cbee

Browse files
committed
PVCUnbinder: Restore and flag claimRef clearing
This reverts commit a89e202 and adds the CLI flag `--allow-pv-rebinding` to opt into the un-reverted functionality. A recent incident hit an exceptional edge case where all redpanda Pods were temporarily stuck in "Pending". Their underlying Nodes were marked as "NotReady" which caused the TaintManager to evict said Pods and prevented them from being rescheduled. The unbinder functioned as expected but due to the reverted commit the still existing PVs were not eligible for rebinding. As a result, a new cluster was formed. The original motivation for the reverted commit was due to an edge case where a Node's name was reused in EKS due to using Node IPs as their name. Our cloud has since begun using the instance ID as the Node name. To remain on the conservative side, we've opted to make this functionality opt in.
1 parent 4c6ebd4 commit 829cbee

File tree

2 files changed

+71
-12
lines changed

2 files changed

+71
-12
lines changed

operator/cmd/run/run.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ func Command() *cobra.Command {
149149
ghostbuster bool
150150
unbindPVCsAfter time.Duration
151151
unbinderSelector LabelSelectorValue
152+
allowPVRebinding bool
152153
autoDeletePVCs bool
153154
webhookCertPath string
154155
webhookCertName string
@@ -213,6 +214,7 @@ func Command() *cobra.Command {
213214
ghostbuster,
214215
unbindPVCsAfter,
215216
unbinderSelector.Selector,
217+
allowPVRebinding,
216218
autoDeletePVCs,
217219
pprofAddr,
218220
webhookCertPath,
@@ -263,6 +265,7 @@ func Command() *cobra.Command {
263265
cmd.Flags().StringSliceVar(&additionalControllers, "additional-controllers", []string{""}, fmt.Sprintf("which controllers to run, available: all, %s", strings.Join(availableControllers, ", ")))
264266
cmd.Flags().BoolVar(&operatorMode, "operator-mode", true, "enables to run as an operator, setting this to false will disable cluster (deprecated), redpanda resources reconciliation.")
265267
cmd.Flags().DurationVar(&unbindPVCsAfter, "unbind-pvcs-after", 0, "if not zero, runs the PVCUnbinder controller which attempts to 'unbind' the PVCs' of Pods that are Pending for longer than the given duration")
268+
cmd.Flags().BoolVar(&allowPVRebinding, "allow-pv-rebinding", false, "controls whether or not PVs unbound by the PVCUnbinder have their .ClaimRef cleared, which allows them to be reused")
266269
cmd.Flags().Var(&unbinderSelector, "unbinder-label-selector", "if provided, a Kubernetes label selector that will filter Pods to be considered by the PVCUnbinder.")
267270
cmd.Flags().BoolVar(&autoDeletePVCs, "auto-delete-pvcs", false, "Use StatefulSet PersistentVolumeClaimRetentionPolicy to auto delete PVCs on scale down and Cluster resource delete.")
268271
cmd.Flags().StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.")
@@ -329,6 +332,7 @@ func Run(
329332
ghostbuster bool,
330333
unbindPVCsAfter time.Duration,
331334
unbinderSelector labels.Selector,
335+
allowPVRebinding bool,
332336
autoDeletePVCs bool,
333337
pprofAddr string,
334338
webhookCertPath string,
@@ -684,14 +688,15 @@ func Run(
684688

685689
// The unbinder gets to run in any mode, if it's enabled.
686690
if unbindPVCsAfter <= 0 {
687-
setupLog.Info("PVCUnbinder controller not active", "unbind-after", unbindPVCsAfter, "selector", unbinderSelector)
691+
setupLog.Info("PVCUnbinder controller not active", "unbind-after", unbindPVCsAfter, "selector", unbinderSelector, "allow-pv-rebinding", allowPVRebinding)
688692
} else {
689-
setupLog.Info("starting PVCUnbinder controller", "unbind-after", unbindPVCsAfter, "selector", unbinderSelector)
693+
setupLog.Info("starting PVCUnbinder controller", "unbind-after", unbindPVCsAfter, "selector", unbinderSelector, "allow-pv-rebinding", allowPVRebinding)
690694

691695
if err := (&pvcunbinder.Controller{
692-
Client: mgr.GetClient(),
693-
Timeout: unbindPVCsAfter,
694-
Selector: unbinderSelector,
696+
Client: mgr.GetClient(),
697+
Timeout: unbindPVCsAfter,
698+
Selector: unbinderSelector,
699+
AllowRebinding: allowPVRebinding,
695700
}).SetupWithManager(mgr); err != nil {
696701
setupLog.Error(err, "unable to create controller", "controller", "PVCUnbinder")
697702
return err

operator/internal/controller/pvcunbinder/pvcunbinder.go

Lines changed: 61 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ package pvcunbinder
1111

1212
import (
1313
"context"
14+
"fmt"
1415
"regexp"
1516
"slices"
1617
"strings"
@@ -44,7 +45,11 @@ var schedulingFailureRE = regexp.MustCompile(`(^0/[1-9]\d* nodes are available)|
4445
// 2. Ensure that all PVs in question have a Retain policy
4546
// 3. Delete all PVCs from step 1. (PVCs are immutable after creation,
4647
// deletion is the only option)
47-
// 4. Deleting the Pod to re-trigger PVC creation and rebinding.
48+
// 4. (Optionally) "Recycle" all PVs from step 1 by clearing the ClaimRef.
49+
// Kubernetes will only consider binding PVs that have a satisfiable
50+
// NodeAffinity. By "recycling" we permit Flakey Nodes to rejoin the cluster
51+
// which _might_ reclaim the now freed volume.
52+
// 5. Deleting the Pod to re-trigger PVC creation and rebinding.
4853
type Controller struct {
4954
Client client.Client
5055
// Timeout is the duration a Pod must be stuck in Pending before
@@ -57,6 +62,15 @@ type Controller struct {
5762
// Reconciler will consider for remediation via some sort of filtering
5863
// function.
5964
Filter func(ctx context.Context, pod *corev1.Pod) (bool, error)
65+
// AllowRebinding optionally enables clearing of the unbound PV's ClaimRef
66+
// which effectively makes the PVs "re-bindable" if the underlying Node
67+
// become capable of scheduling Pods once again.
68+
// NOTE: This option can present problems when a Node's name is reused and
69+
// using HostPath volumes and LocalPathProvisioner. In such a case, the
70+
// helper Pod of LocalPathProvisioner will NOT run a second time as the
71+
// Volume is assumed to exist. This can lead to Permission errors or
72+
// referencing a directory that does not exist.
73+
AllowRebinding bool
6074
}
6175

6276
func FilterPodOwner(ownerNamespace, ownerName string) func(ctx context.Context, pod *corev1.Pod) (bool, error) {
@@ -130,9 +144,9 @@ func (r *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
130144
return ctrl.Result{}, err
131145
}
132146

133-
// Filter PVs down to ones that are:
134-
// 1. Bound to a PVC we care about.
135-
// 2. Have a NodeAffinity (which we assume is the cause of our Pod being in Pending)
147+
// 1. Filter PVs down to ones that are:
148+
// - Bound to a PVC we care about.
149+
// - Have a NodeAffinity (which we assume is the cause of our Pod being in Pending)
136150
var pvs []*corev1.PersistentVolume
137151
for i := range pvList.Items {
138152
pv := &pvList.Items[i]
@@ -161,14 +175,14 @@ func (r *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
161175
pvs = append(pvs, pv)
162176
}
163177

164-
// 3. Ensure that all PVs have reclaim set to Retain
178+
// 2. Ensure that all PVs have reclaim set to Retain
165179
for _, pv := range pvs {
166180
if err := r.ensureRetainPolicy(ctx, pv); err != nil {
167181
return ctrl.Result{}, err
168182
}
169183
}
170184

171-
// 4. Delete all Bound PVCs
185+
// 3. Delete all Bound PVCs
172186
for key, pvc := range pvcByKey {
173187
if pvc == nil || pvc.Spec.VolumeName == "" {
174188
continue
@@ -188,6 +202,14 @@ func (r *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
188202
pvcByKey[key] = nil
189203
}
190204

205+
// 4. "Recycle" PVs that have been released. Technically optional, this
206+
// allows disks to rebind if a Node happens to recover.
207+
for _, pv := range pvs {
208+
if err := r.maybeRecyclePersistentVolume(ctx, pv); err != nil {
209+
return ctrl.Result{}, err
210+
}
211+
}
212+
191213
missingPVCs := false
192214
for _, pvc := range pvcByKey {
193215
if pvc == nil {
@@ -196,7 +218,7 @@ func (r *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
196218
}
197219
}
198220

199-
// 5. Delete the Pod to cause the StatefulSet controller to re-create both
221+
// 6. Delete the Pod to cause the StatefulSet controller to re-create both
200222
// the PVCs and the Pod but only if there are missing PVCs.
201223
if !missingPVCs {
202224
logger.Info("not deleting Pod; no PVCs were deleted", "name", pod.Name)
@@ -231,6 +253,38 @@ func (r *Controller) ensureRetainPolicy(ctx context.Context, pv *corev1.Persiste
231253
return nil
232254
}
233255

256+
// maybeRecyclePersistentVolume "recycles" a released PV by clearing it's .ClaimRef
257+
// which makes it available for binding once again IF AllowRebinding is true.
258+
// This strategy is only valid for volumes that utilize .HostPath or .Local.
259+
func (r *Controller) maybeRecyclePersistentVolume(ctx context.Context, pv *corev1.PersistentVolume) error {
260+
if pv.Spec.HostPath == nil && pv.Spec.Local == nil {
261+
return fmt.Errorf("%T must specify .Spec.HostPath or .Spec.Local for recycling: %q", pv, pv.Name)
262+
}
263+
264+
// NB: We handle this flag here to ensure we get the
265+
if !r.AllowRebinding {
266+
log.FromContext(ctx).Info("Skipping .ClaimRef clearing of PersistentVolume", "name", pv.Name, "AllowRebinding", r.AllowRebinding)
267+
return nil
268+
}
269+
270+
// Skip over unbound PVs.
271+
if pv.Spec.ClaimRef == nil {
272+
return nil
273+
}
274+
275+
log.FromContext(ctx).Info("Clearing .ClaimRef of PersistentVolume", "name", pv.Name, "AllowRebinding", r.AllowRebinding)
276+
277+
// NB: We explicitly don't use an optimistic lock here as the control plane
278+
// will likely have updated this PV's Status to indicate that it's now
279+
// Released.
280+
patch := client.StrategicMergeFrom(pv.DeepCopy())
281+
pv.Spec.ClaimRef = nil
282+
if err := r.Client.Patch(ctx, pv, patch); err != nil {
283+
return err
284+
}
285+
return nil
286+
}
287+
234288
func (r *Controller) ShouldRemediate(ctx context.Context, pod *corev1.Pod) (bool, time.Duration) {
235289
if r.Selector != nil && !r.Selector.Matches(labels.Set(pod.Labels)) {
236290
log.FromContext(ctx).Info("selector not satisfied; skipping", "name", pod.Name, "labels", pod.Labels, "selector", r.Selector.String())

0 commit comments

Comments
 (0)