@@ -45,13 +45,12 @@ const (
45
45
// This situation should self-correct because the PDB controller removes
46
46
// entries from the map automatically after the PDB DeletionTimeout regardless.
47
47
MaxDisruptedPodSize = 2000
48
- retrySteps = 20
49
48
)
50
49
51
50
// EvictionsRetry is the retry for a conflict where multiple clients
52
51
// are making changes to the same resource.
53
52
var EvictionsRetry = wait.Backoff {
54
- Steps : retrySteps ,
53
+ Steps : 20 ,
55
54
Duration : 500 * time .Millisecond ,
56
55
Factor : 1.0 ,
57
56
Jitter : 0.1 ,
@@ -124,7 +123,14 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
124
123
125
124
var pod * api.Pod
126
125
deletedPod := false
127
- err = retry .RetryOnConflict (EvictionsRetry , func () error {
126
+ // by default, retry conflict errors
127
+ shouldRetry := errors .IsConflict
128
+ if ! resourceVersionIsUnset (originalDeleteOptions ) {
129
+ // if the original options included a resourceVersion precondition, don't retry
130
+ shouldRetry = func (err error ) bool { return false }
131
+ }
132
+
133
+ err = retry .OnError (EvictionsRetry , shouldRetry , func () error {
128
134
obj , err = r .store .Get (ctx , eviction .Name , & metav1.GetOptions {})
129
135
if err != nil {
130
136
return err
@@ -140,6 +146,9 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
140
146
141
147
// the PDB can be ignored, so delete the pod
142
148
deletionOptions := originalDeleteOptions .DeepCopy ()
149
+ // We should check if resourceVersion is already set by the requestor
150
+ // as it might be older than the pod we just fetched and should be
151
+ // honored.
143
152
if shouldEnforceResourceVersion (pod ) && resourceVersionIsUnset (originalDeleteOptions ) {
144
153
// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
145
154
// racing with another PDB-impacting process elsewhere.
@@ -158,7 +167,7 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
158
167
switch {
159
168
case err != nil :
160
169
// this can happen in cases where the PDB can be ignored, but there was a problem issuing the pod delete:
161
- // maybe we conflicted two many times or we didn't have permission or something else weird.
170
+ // maybe we conflicted too many times or we didn't have permission or something else weird.
162
171
return nil , err
163
172
164
173
case deletedPod :
@@ -245,11 +254,13 @@ func canIgnorePDB(pod *api.Pod) bool {
245
254
}
246
255
247
256
func shouldEnforceResourceVersion (pod * api.Pod ) bool {
248
- // Only pods that may be included as health in PDBs in the future need to be checked.
249
- if pod .Status .Phase == api .PodPending {
250
- return true
257
+ // We don't need to enforce ResourceVersion for terminal pods
258
+ if pod .Status .Phase == api .PodSucceeded || pod . Status . Phase == api . PodFailed {
259
+ return false
251
260
}
252
- return false
261
+ // Return true for all other pods to ensure we don't race against a pod becoming
262
+ // ready and violating PDBs.
263
+ return true
253
264
}
254
265
255
266
func resourceVersionIsUnset (options * metav1.DeleteOptions ) bool {
0 commit comments