@@ -213,9 +213,27 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
213213 }
214214
215215 // Check if deployment is being updated or rolling out
216- if deployment .Status .UnavailableReplicas > 0 ||
217- deployment .Status .UpdatedReplicas < deployment .Status .Replicas {
218- a .logger .Debugf ("Deployment %s has unavailable replicas, likely due to pod disruption" , deploymentSpec .Name )
216+ // This includes several scenarios:
217+ // 1. UnavailableReplicas > 0: Some replicas are not ready
218+ // 2. UpdatedReplicas < Replicas: Rollout in progress
219+ // 3. Generation != ObservedGeneration: Spec changed but not yet observed
220+ // 4. AvailableReplicas < desired: Not all replicas are available yet
221+ desiredReplicas := int32 (1 )
222+ if deployment .Spec .Replicas != nil {
223+ desiredReplicas = * deployment .Spec .Replicas
224+ }
225+ isRollingOut := deployment .Status .UnavailableReplicas > 0 ||
226+ deployment .Status .UpdatedReplicas < deployment .Status .Replicas ||
227+ deployment .Generation != deployment .Status .ObservedGeneration ||
228+ deployment .Status .AvailableReplicas < desiredReplicas
229+
230+ if isRollingOut {
231+ a .logger .Debugf ("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption" ,
232+ deploymentSpec .Name ,
233+ deployment .Status .UnavailableReplicas ,
234+ deployment .Status .UpdatedReplicas , deployment .Status .Replicas ,
235+ deployment .Status .AvailableReplicas , desiredReplicas ,
236+ deployment .Status .ObservedGeneration , deployment .Generation )
219237
220238 // Check pod status to confirm disruption
221239 selector , err := metav1 .LabelSelectorAsSelector (deployment .Spec .Selector )
@@ -230,6 +248,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
230248 continue
231249 }
232250
251+ // For single-replica deployments during rollout, if no pods exist yet,
252+ // this is likely the brief window where the old pod is gone and new pod
253+ // hasn't been created yet. This is expected disruption during upgrade.
254+ // According to the OpenShift contract: "A component must not report Available=False
255+ // during the course of a normal upgrade."
256+ if len (pods ) == 0 && desiredReplicas == 1 && isRollingOut {
257+ a .logger .Infof ("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed" , deploymentSpec .Name )
258+ return true
259+ }
260+
261+ // Track if we found any real failures or expected disruptions
262+ foundExpectedDisruption := false
263+ foundRealFailure := false
264+
233265 // Check if any pod is in expected disruption state
234266 for _ , pod := range pods {
235267 // Check how long the pod has been in disrupted state
@@ -244,7 +276,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
244276 // Pod is terminating (DeletionTimestamp is set)
245277 if pod .DeletionTimestamp != nil {
246278 a .logger .Debugf ("Pod %s is terminating - expected disruption" , pod .Name )
247- return true
279+ foundExpectedDisruption = true
280+ continue
248281 }
249282
250283 // For pending pods, we need to distinguish between expected disruption
@@ -257,11 +290,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
257290 // Check pod conditions for scheduling issues
258291 for _ , condition := range pod .Status .Conditions {
259292 if condition .Type == corev1 .PodScheduled && condition .Status == corev1 .ConditionFalse {
260- // If pod has been unschedulable for a while, it's likely a real issue
261- // not a temporary disruption from cluster upgrade
262293 if condition .Reason == "Unschedulable" {
263- isRealFailure = true
264- a .logger .Debugf ("Pod %s is unschedulable - not a temporary disruption" , pod .Name )
294+ // CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
295+ // due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
296+ // This is especially common in single-node clusters or control plane scenarios.
297+ // Per OpenShift contract: "A component must not report Available=False during normal upgrade."
298+ if desiredReplicas == 1 && isRollingOut {
299+ a .logger .Infof ("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption" , pod .Name )
300+ isExpectedDisruption = true
301+ } else {
302+ // Multi-replica or non-rollout Unschedulable is a real issue
303+ isRealFailure = true
304+ foundRealFailure = true
305+ a .logger .Debugf ("Pod %s is unschedulable - not a temporary disruption" , pod .Name )
306+ }
265307 break
266308 }
267309 }
@@ -278,6 +320,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
278320 case "ImagePullBackOff" , "ErrImagePull" , "CrashLoopBackOff" , "CreateContainerConfigError" , "InvalidImageName" :
279321 // These are real failures, not temporary disruptions
280322 isRealFailure = true
323+ foundRealFailure = true
281324 a .logger .Debugf ("Pod %s has container error %s - real failure, not disruption" , pod .Name , reason )
282325 }
283326 }
@@ -292,6 +335,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
292335 isExpectedDisruption = true
293336 case "ImagePullBackOff" , "ErrImagePull" , "CrashLoopBackOff" , "CreateContainerConfigError" , "InvalidImageName" :
294337 isRealFailure = true
338+ foundRealFailure = true
295339 a .logger .Debugf ("Pod %s has init container error %s - real failure, not disruption" , pod .Name , reason )
296340 }
297341 }
@@ -302,17 +346,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
302346 continue
303347 }
304348
305- // If it's in expected disruption state, return true
349+ // If it's in expected disruption state, mark it
306350 if isExpectedDisruption {
307351 a .logger .Debugf ("Pod %s is in expected disruption state" , pod .Name )
308- return true
352+ foundExpectedDisruption = true
353+ continue
309354 }
310355
311356 // If pending without clear container status, check if it's just being scheduled
312357 // This could be normal pod creation during node drain
313358 if len (pod .Status .ContainerStatuses ) == 0 && len (pod .Status .InitContainerStatuses ) == 0 {
314359 a .logger .Debugf ("Pod %s is pending without container statuses - likely being scheduled" , pod .Name )
315- return true
360+ foundExpectedDisruption = true
361+ continue
316362 }
317363 }
318364
@@ -323,14 +369,85 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
323369 switch reason {
324370 case "ContainerCreating" , "PodInitializing" :
325371 a .logger .Debugf ("Pod %s container is starting - expected disruption" , pod .Name )
326- return true
372+ foundExpectedDisruption = true
327373 case "ImagePullBackOff" , "ErrImagePull" , "CrashLoopBackOff" :
328374 // Real failures - don't treat as disruption
329375 a .logger .Debugf ("Pod %s has container error %s - not treating as disruption" , pod .Name , reason )
376+ foundRealFailure = true
330377 }
331378 }
332379 }
333380 }
381+
382+ // After checking all pods, make a decision
383+ // If we found expected disruption and no real failures, treat as disruption
384+ if foundExpectedDisruption && ! foundRealFailure {
385+ a .logger .Infof ("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract" , deploymentSpec .Name )
386+ return true
387+ }
388+
389+ // For single-replica deployments during rollout, if we found no real failures,
390+ // treat as expected disruption to comply with the OpenShift contract:
391+ // "A component must not report Available=False during the course of a normal upgrade."
392+ // Single-replica deployments inherently have unavailability during rollout,
393+ // but this is acceptable and should not trigger Available=False.
394+ if ! foundRealFailure && desiredReplicas == 1 && isRollingOut {
395+ a .logger .Infof ("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract" , deploymentSpec .Name )
396+ return true
397+ }
398+ }
399+ }
400+
401+ return false
402+ }
403+
404+ // hasAnyDeploymentInRollout checks if any deployment in the CSV is currently rolling out
405+ // or has unavailable replicas. This is used to detect potential lister cache sync issues
406+ // during SNO upgrades where the informer cache may not have synced yet after node reboot.
407+ func (a * Operator ) hasAnyDeploymentInRollout (csv * v1alpha1.ClusterServiceVersion ) bool {
408+ strategy , err := a .resolver .UnmarshalStrategy (csv .Spec .InstallStrategy )
409+ if err != nil {
410+ a .logger .Debugf ("Unable to unmarshal strategy for CSV %s: %v" , csv .Name , err )
411+ return false
412+ }
413+
414+ strategyDetailsDeployment , ok := strategy .(* v1alpha1.StrategyDetailsDeployment )
415+ if ! ok {
416+ a .logger .Debugf ("CSV %s does not use deployment strategy" , csv .Name )
417+ return false
418+ }
419+
420+ for _ , deploymentSpec := range strategyDetailsDeployment .DeploymentSpecs {
421+ deployment , err := a .lister .AppsV1 ().DeploymentLister ().Deployments (csv .Namespace ).Get (deploymentSpec .Name )
422+ if err != nil {
423+ // If deployment is not found, it could be:
424+ // 1. A real error (deployment never existed or was deleted)
425+ // 2. Cache sync issue after node reboot
426+ // We can't distinguish between these cases here, so we don't assume rollout.
427+ // The caller should handle this case appropriately.
428+ a .logger .Debugf ("Error getting deployment %s: %v" , deploymentSpec .Name , err )
429+ continue
430+ }
431+
432+ // Check if deployment is rolling out or has unavailable replicas
433+ // This covers various scenarios during cluster upgrades
434+ isRollingOut := deployment .Status .UnavailableReplicas > 0 ||
435+ deployment .Status .UpdatedReplicas < deployment .Status .Replicas ||
436+ deployment .Generation != deployment .Status .ObservedGeneration
437+
438+ // Also check if available replicas are less than desired (for single-replica case)
439+ if deployment .Spec .Replicas != nil {
440+ isRollingOut = isRollingOut || deployment .Status .AvailableReplicas < * deployment .Spec .Replicas
441+ }
442+
443+ if isRollingOut {
444+ a .logger .Debugf ("Deployment %s is rolling out (unavailable=%d, updated=%d/%d, available=%d, generation=%d/%d)" ,
445+ deploymentSpec .Name ,
446+ deployment .Status .UnavailableReplicas ,
447+ deployment .Status .UpdatedReplicas , deployment .Status .Replicas ,
448+ deployment .Status .AvailableReplicas ,
449+ deployment .Status .ObservedGeneration , deployment .Generation )
450+ return true
334451 }
335452 }
336453
@@ -352,12 +469,19 @@ func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion)
352469 if ! install .IsAPIServiceAvailable (apiService ) {
353470 a .logger .Debugf ("APIService not available for %s" , desc .GetName ())
354471
355- // Check if this unavailability is due to expected pod disruption
356- // If so, we should not immediately mark as failed or trigger Progressing=True
357- if a .isAPIServiceBackendDisrupted (csv , desc .GetName ()) {
358- a .logger .Infof ("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry" , desc .GetName ())
359- // Return an error to trigger retry, but don't mark as definitively unavailable
360- return false , olmerrors .NewRetryableError (fmt .Errorf ("APIService %s temporarily unavailable due to pod disruption" , desc .GetName ()))
472+ // Only check for pod disruption when CSV is in Succeeded phase.
473+ // During Installing phase, deployment rollout is expected and should not
474+ // trigger the disruption detection logic.
475+ // The disruption detection is specifically for cluster upgrades/reboots
476+ // when a previously healthy CSV becomes temporarily unavailable.
477+ if csv .Status .Phase == v1alpha1 .CSVPhaseSucceeded {
478+ // Check if this unavailability is due to expected pod disruption
479+ // If so, we should not immediately mark as failed or trigger Progressing=True
480+ if a .isAPIServiceBackendDisrupted (csv , desc .GetName ()) {
481+ a .logger .Infof ("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry" , desc .GetName ())
482+ // Return an error to trigger retry, but don't mark as definitively unavailable
483+ return false , olmerrors .NewRetryableError (fmt .Errorf ("APIService %s temporarily unavailable due to pod disruption" , desc .GetName ()))
484+ }
361485 }
362486
363487 return false , nil
0 commit comments