
Commit 7a63f90

Fix: OLM should not report Progressing=True during pod disruption from cluster upgrades
1 parent 2dac489 commit 7a63f90

5 files changed: +772 -0 lines changed

pkg/controller/errors/errors.go

Lines changed: 18 additions & 0 deletions
@@ -72,3 +72,21 @@ type GroupVersionKindNotFoundError struct {
 func (g GroupVersionKindNotFoundError) Error() string {
     return fmt.Sprintf("Unable to find GVK in discovery: %s %s %s", g.Group, g.Version, g.Kind)
 }
+
+// RetryableError indicates a temporary error that should be retried.
+// This is used for expected transient failures like pod disruptions during cluster upgrades.
+type RetryableError struct {
+    error
+}
+
+func NewRetryableError(err error) RetryableError {
+    return RetryableError{err}
+}
+
+func IsRetryable(err error) bool {
+    switch err.(type) {
+    case RetryableError:
+        return true
+    }
+    return false
+}
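
As a usage sketch (not part of this commit), a caller of this package could branch on the two error kinds to decide between giving up and retrying; the helper below is purely illustrative:

package example

import (
    olmerrors "github.com/operator-framework/operator-lifecycle-manager/pkg/controller/errors"
)

// classify is a hypothetical helper showing how callers might branch on the
// error kinds defined in pkg/controller/errors.
func classify(err error) string {
    switch {
    case err == nil:
        return "ok"
    case olmerrors.IsFatal(err):
        return "fatal" // permanent failure: give up and report it
    case olmerrors.IsRetryable(err):
        return "retryable" // transient (e.g. pod disruption): requeue and try again
    default:
        return "unclassified" // ordinary error handling applies
    }
}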
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+package errors
+
+import (
+    "errors"
+    "testing"
+
+    "github.com/stretchr/testify/require"
+)
+
+func TestRetryableError(t *testing.T) {
+    baseErr := errors.New("test error")
+
+    retryErr := NewRetryableError(baseErr)
+    require.True(t, IsRetryable(retryErr), "NewRetryableError should create a retryable error")
+    require.Equal(t, baseErr.Error(), retryErr.Error(), "RetryableError should preserve the underlying error message")
+
+    normalErr := errors.New("normal error")
+    require.False(t, IsRetryable(normalErr), "Normal error should not be retryable")
+}
+
+func TestFatalError(t *testing.T) {
+    baseErr := errors.New("test error")
+
+    fatalErr := NewFatalError(baseErr)
+    require.True(t, IsFatal(fatalErr), "NewFatalError should create a fatal error")
+
+    normalErr := errors.New("normal error")
+    require.False(t, IsFatal(normalErr), "Normal error should not be fatal")
+}
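
Worth noting, since IsRetryable matches on the concrete type: a RetryableError that is wrapped again with %w is no longer detected by the type switch, though errors.As still finds it. A hedged sketch of an additional test (not part of this commit; it assumes "fmt" is added to the imports):

func TestRetryableErrorWrapped(t *testing.T) {
    retryErr := NewRetryableError(errors.New("test error"))
    wrapped := fmt.Errorf("syncing CSV: %w", retryErr)

    // The type switch in IsRetryable sees *fmt.wrapError, not RetryableError.
    require.False(t, IsRetryable(wrapped), "IsRetryable does not unwrap")

    // errors.As unwraps and still recovers the RetryableError value.
    var re RetryableError
    require.True(t, errors.As(wrapped, &re), "errors.As finds the wrapped RetryableError")
}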

pkg/controller/operators/olm/apiservices.go

Lines changed: 91 additions & 0 deletions
@@ -7,6 +7,7 @@ import (
     hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
     log "github.com/sirupsen/logrus"
     appsv1 "k8s.io/api/apps/v1"
+    corev1 "k8s.io/api/core/v1"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/labels"
@@ -168,6 +169,87 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
     return utilerrors.NewAggregate(errs)
 }
 
+// isAPIServiceBackendDisrupted checks if the APIService is unavailable due to expected pod disruption
+// (e.g., during node reboot or cluster upgrade) rather than an actual failure.
+// According to the Progressing condition contract, operators should not report Progressing=True
+// only because pods are adjusting to new nodes or rebooting during cluster upgrade.
+func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVersion, apiServiceName string) bool {
+    // Get the deployment that backs this APIService.
+    // For most APIServices, the deployment name matches the CSV name or is specified in the CSV.
+
+    // Try to find the deployment from the CSV's install strategy.
+    strategy, err := a.resolver.UnmarshalStrategy(csv.Spec.InstallStrategy)
+    if err != nil {
+        a.logger.Debugf("Unable to unmarshal strategy for CSV %s: %v", csv.Name, err)
+        return false
+    }
+
+    strategyDetailsDeployment, ok := strategy.(*v1alpha1.StrategyDetailsDeployment)
+    if !ok {
+        a.logger.Debugf("CSV %s does not use deployment strategy", csv.Name)
+        return false
+    }
+
+    // Check each deployment's pods.
+    for _, deploymentSpec := range strategyDetailsDeployment.DeploymentSpecs {
+        deployment, err := a.lister.AppsV1().DeploymentLister().Deployments(csv.Namespace).Get(deploymentSpec.Name)
+        if err != nil {
+            if apierrors.IsNotFound(err) {
+                continue
+            }
+            a.logger.Debugf("Error getting deployment %s: %v", deploymentSpec.Name, err)
+            continue
+        }
+
+        // Check if the deployment is being updated or rolling out.
+        if deployment.Status.UnavailableReplicas > 0 ||
+            deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
+            a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+
+            // Check pod status to confirm disruption.
+            selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
+            if err != nil {
+                a.logger.Debugf("Error parsing deployment selector: %v", err)
+                continue
+            }
+
+            pods, err := a.lister.CoreV1().PodLister().Pods(csv.Namespace).List(selector)
+            if err != nil {
+                a.logger.Debugf("Error listing pods: %v", err)
+                continue
+            }
+
+            // Check if any pod is in a Terminating or ContainerCreating state.
+            for _, pod := range pods {
+                // Pod is terminating (DeletionTimestamp is set).
+                if pod.DeletionTimestamp != nil {
+                    a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
+                    return true
+                }
+
+                // Pod is pending (being scheduled/created).
+                if pod.Status.Phase == corev1.PodPending {
+                    a.logger.Debugf("Pod %s is pending - expected disruption", pod.Name)
+                    return true
+                }
+
+                // Check container statuses for containers that are still starting.
+                for _, containerStatus := range pod.Status.ContainerStatuses {
+                    if containerStatus.State.Waiting != nil {
+                        reason := containerStatus.State.Waiting.Reason
+                        if reason == "ContainerCreating" || reason == "PodInitializing" {
+                            a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
+                            return true
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return false
+}
+
 func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion) (bool, error) {
     for _, desc := range csv.Spec.APIServiceDefinitions.Owned {
         apiService, err := a.lister.APIRegistrationV1().APIServiceLister().Get(desc.GetName())
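
For illustration only, a hypothetical standalone restatement of the per-pod checks above (not part of this commit): only terminating, pending, or still-starting pods count as expected disruption, so waiting reasons such as CrashLoopBackOff or ImagePullBackOff still surface as real unavailability.

// isExpectedPodDisruption is a hypothetical condensed form of the per-pod
// checks in isAPIServiceBackendDisrupted; it uses the same corev1 types.
func isExpectedPodDisruption(pod *corev1.Pod) bool {
    if pod.DeletionTimestamp != nil {
        return true // terminating, e.g. node drain during an upgrade
    }
    if pod.Status.Phase == corev1.PodPending {
        return true // being scheduled or created on a new node
    }
    for _, cs := range pod.Status.ContainerStatuses {
        if cs.State.Waiting != nil &&
            (cs.State.Waiting.Reason == "ContainerCreating" || cs.State.Waiting.Reason == "PodInitializing") {
            return true // container still starting after a reschedule
        }
    }
    // Anything else (CrashLoopBackOff, ImagePullBackOff, ...) is not treated
    // as expected disruption.
    return false
}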
@@ -182,6 +264,15 @@ func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion)
 
         if !install.IsAPIServiceAvailable(apiService) {
             a.logger.Debugf("APIService not available for %s", desc.GetName())
+
+            // Check if this unavailability is due to expected pod disruption.
+            // If so, we should not immediately mark as failed or trigger Progressing=True.
+            if a.isAPIServiceBackendDisrupted(csv, desc.GetName()) {
+                a.logger.Infof("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry", desc.GetName())
+                // Return an error to trigger retry, but don't mark as definitively unavailable.
+                return false, olmerrors.NewRetryableError(fmt.Errorf("APIService %s temporarily unavailable due to pod disruption", desc.GetName()))
+            }
+
             return false, nil
         }
 
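The handling of this RetryableError by the CSV sync lives in the remaining files of this commit and is not shown here. As a rough, hypothetical sketch of the intended flow (requeueCSV is an illustrative name, not actual code from this commit), a caller could requeue on retryable errors instead of moving the CSV to a failed or progressing state:

// Hypothetical caller sketch: transient APIService disruption is retried
// later without touching the CSV's reported conditions.
func (a *Operator) checkAPIServicesOrRequeue(csv *v1alpha1.ClusterServiceVersion) (bool, error) {
    available, err := a.areAPIServicesAvailable(csv)
    if err != nil {
        if olmerrors.IsRetryable(err) {
            a.requeueCSV(csv) // hypothetical requeue helper
            return false, nil
        }
        return false, err
    }
    return available, nil
}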