Commit 53d0634

Operator operator-lifecycle-manager-packageserver is in Available=False state running in SNO
1 parent 58664cc commit 53d0634

File tree: 3 files changed (+392 -19 lines)

pkg/controller/operators/olm/apiservices.go

Lines changed: 142 additions & 18 deletions
@@ -213,9 +213,27 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 		}
 
 		// Check if deployment is being updated or rolling out
-		if deployment.Status.UnavailableReplicas > 0 ||
-			deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
-			a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+		// This includes several scenarios:
+		// 1. UnavailableReplicas > 0: Some replicas are not ready
+		// 2. UpdatedReplicas < Replicas: Rollout in progress
+		// 3. Generation != ObservedGeneration: Spec changed but not yet observed
+		// 4. AvailableReplicas < desired: Not all replicas are available yet
+		desiredReplicas := int32(1)
+		if deployment.Spec.Replicas != nil {
+			desiredReplicas = *deployment.Spec.Replicas
+		}
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration ||
+			deployment.Status.AvailableReplicas < desiredReplicas
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas, desiredReplicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
 
 			// Check pod status to confirm disruption
 			selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
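The rollout check above reads only standard apps/v1 Deployment status fields. As a rough, standalone illustration (not code from this commit), the same predicate can be exercised against a hand-built Deployment; the helper name isRollingOut and the sample values are hypothetical:

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// isRollingOut mirrors the predicate introduced in the hunk above: any sign
// that the Deployment has not converged on its desired state counts as a rollout.
func isRollingOut(d *appsv1.Deployment) bool {
	desired := int32(1)
	if d.Spec.Replicas != nil {
		desired = *d.Spec.Replicas
	}
	return d.Status.UnavailableReplicas > 0 ||
		d.Status.UpdatedReplicas < d.Status.Replicas ||
		d.Generation != d.Status.ObservedGeneration ||
		d.Status.AvailableReplicas < desired
}

func main() {
	one := int32(1)
	// Hypothetical single-replica, packageserver-style Deployment mid-rollout:
	// the old pod is gone and the replacement is not yet available.
	d := &appsv1.Deployment{}
	d.Spec.Replicas = &one
	d.Generation = 2
	d.Status.ObservedGeneration = 2
	d.Status.Replicas = 1
	d.Status.UpdatedReplicas = 1
	d.Status.AvailableReplicas = 0
	d.Status.UnavailableReplicas = 1

	fmt.Println(isRollingOut(d)) // true: treated as expected disruption, not a failure
}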
@@ -230,6 +248,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				continue
 			}
 
+			// For single-replica deployments during rollout, if no pods exist yet,
+			// this is likely the brief window where the old pod is gone and new pod
+			// hasn't been created yet. This is expected disruption during upgrade.
+			// According to the OpenShift contract: "A component must not report Available=False
+			// during the course of a normal upgrade."
+			if len(pods) == 0 && desiredReplicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed", deploymentSpec.Name)
+				return true
+			}
+
+			// Track if we found any real failures or expected disruptions
+			foundExpectedDisruption := false
+			foundRealFailure := false
+
 			// Check if any pod is in expected disruption state
 			for _, pod := range pods {
 				// Check how long the pod has been in disrupted state
@@ -244,7 +276,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				// Pod is terminating (DeletionTimestamp is set)
 				if pod.DeletionTimestamp != nil {
 					a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 
 				// For pending pods, we need to distinguish between expected disruption
@@ -257,11 +290,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 					// Check pod conditions for scheduling issues
 					for _, condition := range pod.Status.Conditions {
 						if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
-							// If pod has been unschedulable for a while, it's likely a real issue
-							// not a temporary disruption from cluster upgrade
 							if condition.Reason == "Unschedulable" {
-								isRealFailure = true
-								a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								// CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
+								// due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
+								// This is especially common in single-node clusters or control plane scenarios.
+								// Per OpenShift contract: "A component must not report Available=False during normal upgrade."
+								if desiredReplicas == 1 && isRollingOut {
+									a.logger.Infof("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption", pod.Name)
+									isExpectedDisruption = true
+								} else {
+									// Multi-replica or non-rollout Unschedulable is a real issue
+									isRealFailure = true
+									foundRealFailure = true
+									a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								}
 								break
 							}
 						}
@@ -278,6 +320,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 								// These are real failures, not temporary disruptions
 								isRealFailure = true
+								foundRealFailure = true
 								a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
 							}
 						}
@@ -292,6 +335,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 								isExpectedDisruption = true
 							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 								isRealFailure = true
+								foundRealFailure = true
 								a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
 							}
 						}
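For reference, the split between transient and fatal container waiting reasons used in the two hunks above can be summarized as a tiny standalone sketch (illustrative only; classifyWaitingReason is not a function in this commit):

package main

import "fmt"

// classifyWaitingReason mirrors the reason handling above: creation and
// initialization reasons are transient disruption, while image and crash
// errors are treated as real failures.
func classifyWaitingReason(reason string) (expectedDisruption, realFailure bool) {
	switch reason {
	case "ContainerCreating", "PodInitializing":
		return true, false
	case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff",
		"CreateContainerConfigError", "InvalidImageName":
		return false, true
	}
	return false, false
}

func main() {
	for _, r := range []string{"ContainerCreating", "ImagePullBackOff", "SomethingElse"} {
		disruption, failure := classifyWaitingReason(r)
		fmt.Printf("%-22s expectedDisruption=%v realFailure=%v\n", r, disruption, failure)
	}
}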
@@ -302,17 +346,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						continue
 					}
 
-					// If it's in expected disruption state, return true
+					// If it's in expected disruption state, mark it
 					if isExpectedDisruption {
 						a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
-						return true
+						foundExpectedDisruption = true
+						continue
 					}
 
 					// If pending without clear container status, check if it's just being scheduled
 					// This could be normal pod creation during node drain
 					if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
 						a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
-						return true
+						foundExpectedDisruption = true
+						continue
 					}
 				}
 
@@ -323,14 +369,85 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						switch reason {
 						case "ContainerCreating", "PodInitializing":
 							a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
-							return true
+							foundExpectedDisruption = true
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
 							// Real failures - don't treat as disruption
 							a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+							foundRealFailure = true
 						}
 					}
 				}
 			}
+
+			// After checking all pods, make a decision
+			// If we found expected disruption and no real failures, treat as disruption
+			if foundExpectedDisruption && !foundRealFailure {
+				a.logger.Infof("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract", deploymentSpec.Name)
+				return true
+			}
+
+			// For single-replica deployments during rollout, if we found no real failures,
+			// treat as expected disruption to comply with the OpenShift contract:
+			// "A component must not report Available=False during the course of a normal upgrade."
+			// Single-replica deployments inherently have unavailability during rollout,
+			// but this is acceptable and should not trigger Available=False.
+			if !foundRealFailure && desiredReplicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract", deploymentSpec.Name)
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// hasAnyDeploymentInRollout checks if any deployment in the CSV is currently rolling out
+// or has unavailable replicas. This is used to detect potential lister cache sync issues
+// during SNO upgrades where the informer cache may not have synced yet after node reboot.
+func (a *Operator) hasAnyDeploymentInRollout(csv *v1alpha1.ClusterServiceVersion) bool {
+	strategy, err := a.resolver.UnmarshalStrategy(csv.Spec.InstallStrategy)
+	if err != nil {
+		a.logger.Debugf("Unable to unmarshal strategy for CSV %s: %v", csv.Name, err)
+		return false
+	}
+
+	strategyDetailsDeployment, ok := strategy.(*v1alpha1.StrategyDetailsDeployment)
+	if !ok {
+		a.logger.Debugf("CSV %s does not use deployment strategy", csv.Name)
+		return false
+	}
+
+	for _, deploymentSpec := range strategyDetailsDeployment.DeploymentSpecs {
+		deployment, err := a.lister.AppsV1().DeploymentLister().Deployments(csv.Namespace).Get(deploymentSpec.Name)
+		if err != nil {
+			// If deployment is not found, it could be:
+			// 1. A real error (deployment never existed or was deleted)
+			// 2. Cache sync issue after node reboot
+			// We can't distinguish between these cases here, so we don't assume rollout.
+			// The caller should handle this case appropriately.
+			a.logger.Debugf("Error getting deployment %s: %v", deploymentSpec.Name, err)
+			continue
+		}
+
+		// Check if deployment is rolling out or has unavailable replicas
+		// This covers various scenarios during cluster upgrades
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration
+
+		// Also check if available replicas are less than desired (for single-replica case)
+		if deployment.Spec.Replicas != nil {
+			isRollingOut = isRollingOut || deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+		}
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out (unavailable=%d, updated=%d/%d, available=%d, generation=%d/%d)",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
+			return true
 		}
 	}
 
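After the per-pod checks, the decision at the end of isAPIServiceBackendDisrupted reduces to a small aggregation rule. A hedged, standalone sketch of that rule (treatAsExpectedDisruption is an illustrative name, not code from this commit):

package main

import "fmt"

// treatAsExpectedDisruption: report disruption only when at least one pod
// looked like transient disruption and no pod showed a real failure, or when
// a single-replica deployment is simply mid-rollout.
func treatAsExpectedDisruption(foundExpectedDisruption, foundRealFailure, isRollingOut bool, desiredReplicas int32) bool {
	if foundExpectedDisruption && !foundRealFailure {
		return true
	}
	// Single-replica rollouts are inherently briefly unavailable; per the
	// OpenShift contract this must not surface as Available=False.
	return !foundRealFailure && desiredReplicas == 1 && isRollingOut
}

func main() {
	// Terminating pod during node drain, nothing broken: treated as disruption.
	fmt.Println(treatAsExpectedDisruption(true, false, true, 1)) // true
	// CrashLoopBackOff present: never masked as disruption.
	fmt.Println(treatAsExpectedDisruption(true, true, true, 1)) // false
	// Single-replica rollout with no pods observed at all.
	fmt.Println(treatAsExpectedDisruption(false, false, true, 1)) // true
}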
@@ -352,12 +469,19 @@ func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion)
 		if !install.IsAPIServiceAvailable(apiService) {
 			a.logger.Debugf("APIService not available for %s", desc.GetName())
 
-			// Check if this unavailability is due to expected pod disruption
-			// If so, we should not immediately mark as failed or trigger Progressing=True
-			if a.isAPIServiceBackendDisrupted(csv, desc.GetName()) {
-				a.logger.Infof("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry", desc.GetName())
-				// Return an error to trigger retry, but don't mark as definitively unavailable
-				return false, olmerrors.NewRetryableError(fmt.Errorf("APIService %s temporarily unavailable due to pod disruption", desc.GetName()))
+			// Only check for pod disruption when CSV is in Succeeded phase.
+			// During Installing phase, deployment rollout is expected and should not
+			// trigger the disruption detection logic.
+			// The disruption detection is specifically for cluster upgrades/reboots
+			// when a previously healthy CSV becomes temporarily unavailable.
+			if csv.Status.Phase == v1alpha1.CSVPhaseSucceeded {
+				// Check if this unavailability is due to expected pod disruption
+				// If so, we should not immediately mark as failed or trigger Progressing=True
+				if a.isAPIServiceBackendDisrupted(csv, desc.GetName()) {
+					a.logger.Infof("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry", desc.GetName())
+					// Return an error to trigger retry, but don't mark as definitively unavailable
+					return false, olmerrors.NewRetryableError(fmt.Errorf("APIService %s temporarily unavailable due to pod disruption", desc.GetName()))
+				}
 			}
 
 			return false, nil