77 hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
88 log "github.com/sirupsen/logrus"
99 appsv1 "k8s.io/api/apps/v1"
10+ corev1 "k8s.io/api/core/v1"
1011 apierrors "k8s.io/apimachinery/pkg/api/errors"
1112 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1213 "k8s.io/apimachinery/pkg/labels"
@@ -168,6 +169,87 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
168169 return utilerrors .NewAggregate (errs )
169170}
170171
172+ // isAPIServiceBackendDisrupted checks if the APIService is unavailable due to expected pod disruption
173+ // (e.g., during node reboot or cluster upgrade) rather than an actual failure.
174+ // According to the Progressing condition contract, operators should not report Progressing=True
175+ // only because pods are adjusting to new nodes or rebooting during cluster upgrade.
176+ func (a * Operator ) isAPIServiceBackendDisrupted (csv * v1alpha1.ClusterServiceVersion , apiServiceName string ) bool {
177+ // Get the deployment that backs this APIService
178+ // For most APIServices, the deployment name matches the CSV name or is specified in the CSV
179+
180+ // Try to find the deployment from the CSV's install strategy
181+ strategy , err := a .resolver .UnmarshalStrategy (csv .Spec .InstallStrategy )
182+ if err != nil {
183+ a .logger .Debugf ("Unable to unmarshal strategy for CSV %s: %v" , csv .Name , err )
184+ return false
185+ }
186+
187+ strategyDetailsDeployment , ok := strategy .(* v1alpha1.StrategyDetailsDeployment )
188+ if ! ok {
189+ a .logger .Debugf ("CSV %s does not use deployment strategy" , csv .Name )
190+ return false
191+ }
192+
193+ // Check each deployment's pods
194+ for _ , deploymentSpec := range strategyDetailsDeployment .DeploymentSpecs {
195+ deployment , err := a .lister .AppsV1 ().DeploymentLister ().Deployments (csv .Namespace ).Get (deploymentSpec .Name )
196+ if err != nil {
197+ if apierrors .IsNotFound (err ) {
198+ continue
199+ }
200+ a .logger .Debugf ("Error getting deployment %s: %v" , deploymentSpec .Name , err )
201+ continue
202+ }
203+
204+ // Check if deployment is being updated or rolling out
205+ if deployment .Status .UnavailableReplicas > 0 ||
206+ deployment .Status .UpdatedReplicas < deployment .Status .Replicas {
207+ a .logger .Debugf ("Deployment %s has unavailable replicas, likely due to pod disruption" , deploymentSpec .Name )
208+
209+ // Check pod status to confirm disruption
210+ selector , err := metav1 .LabelSelectorAsSelector (deployment .Spec .Selector )
211+ if err != nil {
212+ a .logger .Debugf ("Error parsing deployment selector: %v" , err )
213+ continue
214+ }
215+
216+ pods , err := a .lister .CoreV1 ().PodLister ().Pods (csv .Namespace ).List (selector )
217+ if err != nil {
218+ a .logger .Debugf ("Error listing pods: %v" , err )
219+ continue
220+ }
221+
222+ // Check if any pod is in Terminating or ContainerCreating state
223+ for _ , pod := range pods {
224+ // Pod is terminating (DeletionTimestamp is set)
225+ if pod .DeletionTimestamp != nil {
226+ a .logger .Debugf ("Pod %s is terminating - expected disruption" , pod .Name )
227+ return true
228+ }
229+
230+ // Pod is pending (being scheduled/created)
231+ if pod .Status .Phase == corev1 .PodPending {
232+ a .logger .Debugf ("Pod %s is pending - expected disruption" , pod .Name )
233+ return true
234+ }
235+
236+ // Check container statuses for restarting containers
237+ for _ , containerStatus := range pod .Status .ContainerStatuses {
238+ if containerStatus .State .Waiting != nil {
239+ reason := containerStatus .State .Waiting .Reason
240+ if reason == "ContainerCreating" || reason == "PodInitializing" {
241+ a .logger .Debugf ("Pod %s container is starting - expected disruption" , pod .Name )
242+ return true
243+ }
244+ }
245+ }
246+ }
247+ }
248+ }
249+
250+ return false
251+ }
252+
171253func (a * Operator ) areAPIServicesAvailable (csv * v1alpha1.ClusterServiceVersion ) (bool , error ) {
172254 for _ , desc := range csv .Spec .APIServiceDefinitions .Owned {
173255 apiService , err := a .lister .APIRegistrationV1 ().APIServiceLister ().Get (desc .GetName ())
@@ -182,6 +264,15 @@ func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion)
182264
183265 if ! install .IsAPIServiceAvailable (apiService ) {
184266 a .logger .Debugf ("APIService not available for %s" , desc .GetName ())
267+
268+ // Check if this unavailability is due to expected pod disruption
269+ // If so, we should not immediately mark as failed or trigger Progressing=True
270+ if a .isAPIServiceBackendDisrupted (csv , desc .GetName ()) {
271+ a .logger .Infof ("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry" , desc .GetName ())
272+ // Return an error to trigger retry, but don't mark as definitively unavailable
273+ return false , olmerrors .NewRetryableError (fmt .Errorf ("APIService %s temporarily unavailable due to pod disruption" , desc .GetName ()))
274+ }
275+
185276 return false , nil
186277 }
187278
0 commit comments