From 87fb307bb89efaa6421797a5868f499e9ac24fe7 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:16:50 +0100 Subject: [PATCH] Fix: Truncate large error messages in status conditions When upgrading operators, CRD validation errors can be very large (50KB+). Kubernetes rejects status updates over 32KB with "Too long: may not be more than 32768 bytes". This causes ClusterExtension upgrades to fail and get stuck. Assisted-by: Cursor --- .../clusterextension_controller.go | 11 +- .../controllers/common_controller.go | 32 +++- .../controllers/common_controller_test.go | 176 ++++++++++++++++++ 3 files changed, 208 insertions(+), 11 deletions(-) diff --git a/internal/operator-controller/controllers/clusterextension_controller.go b/internal/operator-controller/controllers/clusterextension_controller.go index 24824bfd1..ce6f63c3a 100644 --- a/internal/operator-controller/controllers/clusterextension_controller.go +++ b/internal/operator-controller/controllers/clusterextension_controller.go @@ -22,7 +22,6 @@ import ( "fmt" "io/fs" "strings" - "time" "github.com/go-logr/logr" "helm.sh/helm/v3/pkg/release" @@ -156,15 +155,13 @@ func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.C cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType) if cond == nil { // Create a new condition with a valid reason and add it - newCond := metav1.Condition{ + SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ Type: condType, Status: metav1.ConditionFalse, Reason: string(reason), Message: message, ObservedGeneration: ext.GetGeneration(), - LastTransitionTime: metav1.NewTime(time.Now()), - } - ext.Status.Conditions = append(ext.Status.Conditions, newCond) + }) } } } @@ -381,7 +378,7 @@ func SetDeprecationStatus(ext *ocv1.ClusterExtension, bundleName string, depreca if len(deprecationMessages) > 0 { status, reason, message = metav1.ConditionTrue, ocv1.ReasonDeprecated, strings.Join(deprecationMessages, ";") } - apimeta.SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ + SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ Type: ocv1.TypeDeprecated, Reason: reason, Status: status, @@ -403,7 +400,7 @@ func SetDeprecationStatus(ext *ocv1.ClusterExtension, bundleName string, depreca message = fmt.Sprintf("%s\n%s", message, entry.Message) } } - apimeta.SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ + SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ Type: conditionType, Reason: reason, Status: status, diff --git a/internal/operator-controller/controllers/common_controller.go b/internal/operator-controller/controllers/common_controller.go index 7cee10c10..9195a83f9 100644 --- a/internal/operator-controller/controllers/common_controller.go +++ b/internal/operator-controller/controllers/common_controller.go @@ -27,6 +27,30 @@ import ( ocv1 "github.com/operator-framework/operator-controller/api/v1" ) +const ( + // maxConditionMessageLength set the max message length allowed by Kubernetes. + maxConditionMessageLength = 32768 + // truncationSuffix is the suffix added when a message is cut. + truncationSuffix = "\n\n... [message truncated]" +) + +// truncateMessage cuts long messages to fit Kubernetes condition limits +func truncateMessage(message string) string { + if len(message) <= maxConditionMessageLength { + return message + } + + maxContent := maxConditionMessageLength - len(truncationSuffix) + return message[:maxContent] + truncationSuffix +} + +// SetStatusCondition wraps apimeta.SetStatusCondition and ensures the message is always truncated +// This should be used throughout the codebase instead of apimeta.SetStatusCondition directly +func SetStatusCondition(conditions *[]metav1.Condition, condition metav1.Condition) { + condition.Message = truncateMessage(condition.Message) + apimeta.SetStatusCondition(conditions, condition) +} + // setInstalledStatusFromBundle sets the installed status based on the given installedBundle. func setInstalledStatusFromBundle(ext *ocv1.ClusterExtension, installedBundle *InstalledBundle) { // Nothing is installed @@ -45,7 +69,7 @@ func setInstalledStatusFromBundle(ext *ocv1.ClusterExtension, installedBundle *I // setInstalledStatusConditionSuccess sets the installed status condition to success. func setInstalledStatusConditionSuccess(ext *ocv1.ClusterExtension, message string) { - apimeta.SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ + SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ Type: ocv1.TypeInstalled, Status: metav1.ConditionTrue, Reason: ocv1.ReasonSucceeded, @@ -56,7 +80,7 @@ func setInstalledStatusConditionSuccess(ext *ocv1.ClusterExtension, message stri // setInstalledStatusConditionFailed sets the installed status condition to failed. func setInstalledStatusConditionFailed(ext *ocv1.ClusterExtension, message string) { - apimeta.SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ + SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ Type: ocv1.TypeInstalled, Status: metav1.ConditionFalse, Reason: ocv1.ReasonFailed, @@ -67,7 +91,7 @@ func setInstalledStatusConditionFailed(ext *ocv1.ClusterExtension, message strin // setInstalledStatusConditionUnknown sets the installed status condition to unknown. func setInstalledStatusConditionUnknown(ext *ocv1.ClusterExtension, message string) { - apimeta.SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ + SetStatusCondition(&ext.Status.Conditions, metav1.Condition{ Type: ocv1.TypeInstalled, Status: metav1.ConditionUnknown, Reason: ocv1.ReasonFailed, @@ -99,5 +123,5 @@ func setStatusProgressing(ext *ocv1.ClusterExtension, err error) { progressingCond.Reason = ocv1.ReasonBlocked } - apimeta.SetStatusCondition(&ext.Status.Conditions, progressingCond) + SetStatusCondition(&ext.Status.Conditions, progressingCond) } diff --git a/internal/operator-controller/controllers/common_controller_test.go b/internal/operator-controller/controllers/common_controller_test.go index 7b644172d..057a2c9dc 100644 --- a/internal/operator-controller/controllers/common_controller_test.go +++ b/internal/operator-controller/controllers/common_controller_test.go @@ -2,6 +2,8 @@ package controllers import ( "errors" + "fmt" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -64,3 +66,177 @@ func TestSetStatusProgressing(t *testing.T) { }) } } + +func TestTruncateMessage(t *testing.T) { + tests := []struct { + name string + message string + expected string + }{ + { + name: "short message unchanged", + message: "This is a short message", + expected: "This is a short message", + }, + { + name: "empty message unchanged", + message: "", + expected: "", + }, + { + name: "exact max length message unchanged", + message: strings.Repeat("a", maxConditionMessageLength), + expected: strings.Repeat("a", maxConditionMessageLength), + }, + { + name: "message just over limit gets truncated", + message: strings.Repeat("a", maxConditionMessageLength+1), + expected: strings.Repeat("a", maxConditionMessageLength-len(truncationSuffix)) + truncationSuffix, + }, + { + name: "very long message gets truncated", + message: strings.Repeat("word ", 10000) + "finalword", + expected: strings.Repeat("word ", 10000)[:maxConditionMessageLength-len(truncationSuffix)] + truncationSuffix, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := truncateMessage(tc.message) + require.Equal(t, tc.expected, result) + + // Verify the result is within the limit + require.LessOrEqual(t, len(result), maxConditionMessageLength, + "truncated message should not exceed max length") + + // If the original message was over the limit, verify truncation occurred + if len(tc.message) > maxConditionMessageLength { + require.Contains(t, result, truncationSuffix, + "long messages should contain truncation suffix") + require.Less(t, len(result), len(tc.message), + "truncated message should be shorter than original") + } + }) + } +} + +func TestSetStatusProgressingWithLongMessage(t *testing.T) { + // Simulate a real ClusterExtension CRD upgrade safety check failure with many validation errors + longError := fmt.Sprintf("validating CRD upgrade safety for ClusterExtension 'my-operator': %s", + strings.Repeat("CRD \"myresources.example.com\" v1beta1->v1: field .spec.replicas changed from optional to required, field .spec.config.timeout type changed from string to integer, field .status.conditions[].observedGeneration removed\n", 500)) + + ext := &ocv1.ClusterExtension{ObjectMeta: metav1.ObjectMeta{Name: "my-operator"}} + err := errors.New(longError) + setStatusProgressing(ext, err) + + cond := meta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, cond) + require.LessOrEqual(t, len(cond.Message), maxConditionMessageLength) + require.Contains(t, cond.Message, truncationSuffix) + require.Contains(t, cond.Message, "validating CRD upgrade safety") +} + +func TestClusterExtensionDeprecationMessageTruncation(t *testing.T) { + // Test truncation for ClusterExtension deprecation warnings with many deprecated APIs + ext := &ocv1.ClusterExtension{ObjectMeta: metav1.ObjectMeta{Name: "legacy-operator"}} + + // Simulate many deprecation warnings that would overflow the message limit + deprecationMessages := []string{} + for i := 0; i < 1000; i++ { + deprecationMessages = append(deprecationMessages, fmt.Sprintf("API version 'v1beta1' of resource 'customresources%d.example.com' is deprecated, use 'v1' instead", i)) + } + + longDeprecationMsg := strings.Join(deprecationMessages, "; ") + setInstalledStatusConditionUnknown(ext, longDeprecationMsg) + + cond := meta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, cond) + require.LessOrEqual(t, len(cond.Message), maxConditionMessageLength) + require.Contains(t, cond.Message, truncationSuffix, "deprecation messages should be truncated when too long") + require.Contains(t, cond.Message, "API version", "should preserve important deprecation context") +} + +func TestClusterExtensionInstallationFailureTruncation(t *testing.T) { + // Test truncation for ClusterExtension installation failures with many bundle validation errors + installError := "failed to install ClusterExtension 'argocd-operator': bundle validation errors: " + + strings.Repeat("resource 'deployments/argocd-server' missing required label 'app.kubernetes.io/name', resource 'services/argocd-server-metrics' has invalid port configuration, resource 'configmaps/argocd-cm' contains invalid YAML in data field 'application.yaml'\n", 400) + + ext := &ocv1.ClusterExtension{ObjectMeta: metav1.ObjectMeta{Name: "argocd-operator"}} + setInstalledStatusConditionFailed(ext, installError) + + cond := meta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, cond) + + // Verify message was truncated due to length + require.LessOrEqual(t, len(cond.Message), maxConditionMessageLength) + require.Contains(t, cond.Message, truncationSuffix, "installation failure messages should be truncated when too long") + require.Contains(t, cond.Message, "failed to install ClusterExtension", "should preserve important context") + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, ocv1.ReasonFailed, cond.Reason) + + // Verify original message was actually longer than the limit + require.Greater(t, len(installError), maxConditionMessageLength, "test should use a message that exceeds the limit") +} + +func TestSetStatusConditionWrapper(t *testing.T) { + tests := []struct { + name string + message string + expectedTruncated bool + }{ + { + name: "short message not truncated", + message: "This is a short message", + expectedTruncated: false, + }, + { + name: "long message gets truncated", + message: strings.Repeat("This is a very long message. ", 2000), + expectedTruncated: true, + }, + { + name: "message at exact limit not truncated", + message: strings.Repeat("a", maxConditionMessageLength), + expectedTruncated: false, + }, + { + name: "message over limit gets truncated", + message: strings.Repeat("a", maxConditionMessageLength+1), + expectedTruncated: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + var conditions []metav1.Condition + + // Use our wrapper function + SetStatusCondition(&conditions, metav1.Condition{ + Type: "TestCondition", + Status: metav1.ConditionTrue, + Reason: "Testing", + Message: tc.message, + }) + + require.Len(t, conditions, 1, "should have exactly one condition") + cond := conditions[0] + + // Verify message is within limits + require.LessOrEqual(t, len(cond.Message), maxConditionMessageLength, + "condition message should not exceed max length") + + // Check if truncation occurred as expected + if tc.expectedTruncated { + require.Contains(t, cond.Message, truncationSuffix, + "long messages should contain truncation suffix") + require.Less(t, len(cond.Message), len(tc.message), + "truncated message should be shorter than original") + } else { + require.Equal(t, tc.message, cond.Message, + "short messages should remain unchanged") + require.NotContains(t, cond.Message, truncationSuffix, + "short messages should not contain truncation suffix") + } + }) + } +}