
Commit 4ead3ef

Update controller logic when a failure occurs

An infinite loop may occur on certain misconfigurations, e.g. if the GitHub or OpenStack client is misconfigured. In such cases, the operator correctly sets a condition that the user can use to debug. However, it also returns an error, which makes the operator reconcile again immediately. There is no need to return the error, because the user checks the conditions stored in the status of the object rather than potential error logs. The controller's default syncPeriod, set to 5 minutes, ensures that the controller tries again later to check whether the failed state has been resolved.

See related SovereignCloudStack/cluster-stack-operator#50

Signed-off-by: Matej Feder <[email protected]>
1 parent eb89331 commit 4ead3ef
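To make the new flow concrete, here is a minimal sketch of the pattern this commit applies, using assumed names (sketchReconciler, buildExternalClient) rather than this repository's actual types: on a client misconfiguration, record the failure (condition, event, log) and return a nil error so controller-runtime does not requeue immediately; the manager's 5-minute SyncPeriod then drives the retry.

// Minimal sketch - assumed names, not this repository's code.
package main

import (
    "context"
    "errors"
    "time"

    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/cache"
    "sigs.k8s.io/controller-runtime/pkg/log"
)

type sketchReconciler struct{}

func (r *sketchReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) {
    logger := log.FromContext(ctx)

    // Hypothetical stand-in for creating the GitHub/OpenStack client.
    if err := buildExternalClient(); err != nil {
        // A real controller would also set a status condition and emit a warning event here.
        logger.Error(err, "client misconfigured")
        // nil error: no immediate requeue; the periodic resync retries later.
        return ctrl.Result{}, nil
    }
    return ctrl.Result{}, nil
}

func buildExternalClient() error { return errors.New("client misconfigured") } // placeholder

func main() {
    syncPeriod := 5 * time.Minute // resync interval that drives the retry
    if _, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
        Cache: cache.Options{SyncPeriod: &syncPeriod},
    }); err != nil {
        panic(err)
    }
}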

3 files changed, +55 -38 lines changed

cmd/main.go
Lines changed: 13 additions & 6 deletions

@@ -20,6 +20,7 @@ package main
 import (
     "flag"
     "os"
+    "time"
 
     githubclient "github.com/SovereignCloudStack/cluster-stack-operator/pkg/github/client"
     infrastructureclusterstackxk8siov1alpha1 "github.com/sovereignCloudStack/cluster-stack-provider-openstack/api/v1alpha1"
@@ -31,6 +32,7 @@ import (
     // to ensure that exec-entrypoint and run can make use of them.
     _ "k8s.io/client-go/plugin/pkg/client/auth"
     ctrl "sigs.k8s.io/controller-runtime"
+    "sigs.k8s.io/controller-runtime/pkg/cache"
     "sigs.k8s.io/controller-runtime/pkg/healthz"
     "sigs.k8s.io/controller-runtime/pkg/log/zap"
     metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
@@ -49,8 +51,8 @@ func init() {
 }
 
 var (
-    releaseDir                      string
-    waitForImageBecomeActiveMinutes int
+    releaseDir         string
+    imageImportTimeout int
 )
 
 func main() {
@@ -63,7 +65,7 @@ func main() {
         "Enable leader election for controller manager. "+
             "Enabling this will ensure there is only one active controller manager.")
     flag.StringVar(&releaseDir, "release-dir", "/tmp/downloads/", "Specify release directory for cluster-stack releases")
-    flag.IntVar(&waitForImageBecomeActiveMinutes, "import-timeout", 0, "Maximum time in minutes that you allow cspo to import image. If import-timeout <= 0, cspo waits forever.")
+    flag.IntVar(&imageImportTimeout, "image-import-timeout", 0, "Maximum time in minutes that you allow cspo to import image. If image-import-timeout <= 0, cspo waits forever.")
     opts := zap.Options{
         Development: true,
     }
@@ -72,6 +74,8 @@ func main() {
 
     ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
 
+    syncPeriod := 5 * time.Minute
+
     mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
         Scheme:  scheme,
         Metrics: metricsserver.Options{BindAddress: metricsAddr},
@@ -89,6 +93,9 @@ func main() {
         // if you are doing or is intended to do any operation such as perform cleanups
         // after the manager stops then its usage might be unsafe.
         // LeaderElectionReleaseOnCancel: true,
+        Cache: cache.Options{
+            SyncPeriod: &syncPeriod,
+        },
     })
     if err != nil {
         setupLog.Error(err, "unable to start manager")
@@ -107,9 +114,9 @@ func main() {
         os.Exit(1)
     }
     if err = (&controller.OpenStackNodeImageReleaseReconciler{
-        Client:                          mgr.GetClient(),
-        Scheme:                          mgr.GetScheme(),
-        WaitForImageBecomeActiveMinutes: waitForImageBecomeActiveMinutes,
+        Client:             mgr.GetClient(),
+        Scheme:             mgr.GetScheme(),
+        ImageImportTimeout: imageImportTimeout,
     }).SetupWithManager(mgr); err != nil {
         setupLog.Error(err, "unable to create controller", "controller", "OpenStackNodeImageRelease")
         os.Exit(1)
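For illustration only (the manager binary name below is an assumption, not shown in this commit), the renamed flag would be passed like:

    manager --release-dir=/tmp/downloads/ --image-import-timeout=30

With --image-import-timeout=30, cspo allows an image import 30 minutes to reach the ACTIVE status; the default of 0 keeps the wait-forever behaviour described in the flag's help text.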

internal/controller/openstackclusterstackrelease_controller.go
Lines changed: 7 additions & 11 deletions

@@ -63,7 +63,6 @@ const (
     metadataFileName   = "metadata.yaml"
     nodeImagesFileName = "node-images.yaml"
     waitForOpenStackNodeImageReleasesBecomeReady = 30 * time.Second
-    reconcileOpenStackNodeImageReleases          = 3 * time.Minute
 )
 
 //+kubebuilder:rbac:groups=infrastructure.clusterstack.x-k8s.io,resources=openstackclusterstackreleases,verbs=get;list;watch;create;update;patch;delete
@@ -113,7 +112,8 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,
             err.Error(),
         )
         record.Warnf(openstackclusterstackrelease, "GitTokenOrEnvVariableNotSet", err.Error())
-        return ctrl.Result{}, fmt.Errorf("failed to create Github client: %w", err)
+        logger.Error(err, "failed to create Github client")
+        return ctrl.Result{}, nil
     }
 
     conditions.MarkTrue(openstackclusterstackrelease, apiv1alpha1.GitAPIAvailableCondition)
@@ -140,7 +140,7 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,
     r.openStackClusterStackRelDownloadDirectoryMutex.Lock()
 
     if err := downloadReleaseAssets(ctx, releaseTag, releaseAssets.LocalDownloadPath, gc); err != nil {
-        return ctrl.Result{}, fmt.Errorf("failed to download release assets: %w", err)
+        return ctrl.Result{RequeueAfter: 1 * time.Minute}, fmt.Errorf("failed to download release assets: %w", err)
     }
 
     r.openStackClusterStackRelDownloadDirectoryMutex.Unlock()
@@ -170,7 +170,7 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,
 
         osnirName := fmt.Sprintf("%s-%s-%s", nameWithoutVersion, openStackNodeImage.CreateOpts.Name, nodeImageVersion)
 
-        if err := r.getOrCreateOpenStackNodeImageRelease(ctx, openstackclusterstackrelease, osnirName, openStackNodeImage, ownerRef); err != nil {
+        if err := r.createOrUpdateOpenStackNodeImageRelease(ctx, openstackclusterstackrelease, osnirName, openStackNodeImage, ownerRef); err != nil {
            return ctrl.Result{}, fmt.Errorf("failed to get or create OpenStackNodeImageRelease %s/%s: %w", openstackclusterstackrelease.Namespace, osnirName, err)
        }
    }
@@ -209,11 +209,10 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,
     conditions.MarkTrue(openstackclusterstackrelease, apiv1alpha1.OpenStackNodeImageReleasesReadyCondition)
     openstackclusterstackrelease.Status.Ready = true
 
-    // Requeue to ensure the OpenStackNodeImageReleases are still ready
-    return ctrl.Result{Requeue: true, RequeueAfter: reconcileOpenStackNodeImageReleases}, nil
+    return ctrl.Result{}, nil
 }
 
-func (r *OpenStackClusterStackReleaseReconciler) getOrCreateOpenStackNodeImageRelease(ctx context.Context, openstackclusterstackrelease *apiv1alpha1.OpenStackClusterStackRelease, osnirName string, openStackNodeImage *apiv1alpha1.OpenStackNodeImage, ownerRef *metav1.OwnerReference) error {
+func (r *OpenStackClusterStackReleaseReconciler) createOrUpdateOpenStackNodeImageRelease(ctx context.Context, openstackclusterstackrelease *apiv1alpha1.OpenStackClusterStackRelease, osnirName string, openStackNodeImage *apiv1alpha1.OpenStackNodeImage, ownerRef *metav1.OwnerReference) error {
     openStackNodeImageRelease := &apiv1alpha1.OpenStackNodeImageRelease{}
 
     err := r.Get(ctx, types.NamespacedName{Name: osnirName, Namespace: openstackclusterstackrelease.Namespace}, openStackNodeImageRelease)
@@ -252,10 +251,7 @@ func (r *OpenStackClusterStackReleaseReconciler) getOrCreateOpenStackNodeImageRe
     openStackNodeImageRelease.Spec.IdentityRef = openstackclusterstackrelease.Spec.IdentityRef
 
     if err := r.Create(ctx, openStackNodeImageRelease); err != nil {
-        record.Eventf(openStackNodeImageRelease,
-            "ErrorOpenStackNodeImageRelease",
-            "failed to create %s OpenStackNodeImageRelease: %s", osnirName, err.Error(),
-        )
+        record.Warnf(openStackNodeImageRelease, "ErrorOpenStackNodeImageRelease", err.Error())
         return fmt.Errorf("failed to create OpenStackNodeImageRelease: %w", err)
     }
 
internal/controller/openstacknodeimagerelease_controller.go
Lines changed: 35 additions & 21 deletions

@@ -45,14 +45,13 @@ import (
 // OpenStackNodeImageReleaseReconciler reconciles a OpenStackNodeImageRelease object.
 type OpenStackNodeImageReleaseReconciler struct {
     client.Client
-    Scheme                          *runtime.Scheme
-    WaitForImageBecomeActiveMinutes int
+    Scheme             *runtime.Scheme
+    ImageImportTimeout int
 }
 
 const (
     cloudsSecretKey          = "clouds.yaml"
     waitForImageBecomeActive = 30 * time.Second
-    reconcileImage           = 3 * time.Minute
 )
 
 //+kubebuilder:rbac:groups=infrastructure.clusterstack.x-k8s.io,resources=openstacknodeimagereleases,verbs=get;list;watch;create;update;patch;delete
@@ -113,7 +112,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
     opts := &clientconfig.ClientOpts{AuthInfo: cloud.AuthInfo}
     providerClient, err := clientconfig.AuthenticatedClient(opts)
     if err != nil {
-        return ctrl.Result{}, fmt.Errorf("failed to create a provider client: %w", err)
+        record.Warnf(openstacknodeimagerelease, "OpenStackProviderClientNotSet", err.Error())
+        logger.Error(err, "failed to create a provider client")
+        return ctrl.Result{}, nil
     }
 
     // Create an OpenStack image service client
@@ -126,7 +127,8 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
             err.Error(),
         )
         record.Warnf(openstacknodeimagerelease, "OpenStackImageServiceClientNotSet", err.Error())
-        return ctrl.Result{}, fmt.Errorf("failed to create an image client: %w", err)
+        logger.Error(err, "failed to create an image client")
+        return ctrl.Result{}, nil
     }
 
     conditions.MarkTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageServiceClientAvailableCondition)
@@ -139,7 +141,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
             clusterv1beta1.ConditionSeverityError,
             err.Error(),
         )
-        return ctrl.Result{}, fmt.Errorf("failed to find an image: %w", err)
+        record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToFind", err.Error())
+        logger.Error(err, "failed to find an image")
+        return ctrl.Result{}, nil
     }
 
     if imageID == "" {
@@ -156,7 +160,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
                clusterv1beta1.ConditionSeverityError,
                err.Error(),
            )
-            return ctrl.Result{}, fmt.Errorf("failed to create an image: %w", err)
+            record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToCreate", err.Error())
+            logger.Error(err, "failed to create an image")
+            return ctrl.Result{}, nil
        }
 
        imageImportOpts := imageimport.CreateOpts{
@@ -171,7 +177,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
                clusterv1beta1.ConditionSeverityError,
                err.Error(),
            )
-            return ctrl.Result{}, fmt.Errorf("failed to import an image: %w", err)
+            record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToImport", err.Error())
+            logger.Error(err, "failed to import an image")
+            return ctrl.Result{}, nil
        }
 
        conditions.MarkTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition)
@@ -188,27 +196,30 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
             clusterv1beta1.ConditionSeverityError,
             err.Error(),
         )
-        return ctrl.Result{}, fmt.Errorf("failed to get an image: %w", err)
+        record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToGet", err.Error())
+        logger.Error(err, "failed to get an image")
+        return ctrl.Result{}, nil
     }
 
-    // Check wait for image ACTIVE status duration
-    if r.WaitForImageBecomeActiveMinutes > 0 && conditions.IsTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition) {
-        // Calculate elapsed time since the OpenStackImageImportStartedCondition is true
+    // Ensure that the import does not exceed the timeout duration
+    if r.ImageImportTimeout > 0 && conditions.IsTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition) {
+        // Calculate elapsed time since the OpenStackImageImportStartCondition is true
        startTime := conditions.GetLastTransitionTime(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition)
        elapsedTime := time.Since(startTime.Time)
 
-        waitForImageBecomeActiveTimeout := time.Duration(r.WaitForImageBecomeActiveMinutes) * time.Minute
+        imageImportTimeout := time.Duration(r.ImageImportTimeout) * time.Minute
 
        // Check if the image has been active after waitForImageBecomeActiveTimeout minutes
-        if image.Status != images.ImageStatusActive && elapsedTime > waitForImageBecomeActiveTimeout {
-            err = fmt.Errorf("timeout - wait for the image %s to transition to the ACTIVE status exceeds the timeout duration %d minutes", image.Name, r.WaitForImageBecomeActiveMinutes)
-            logger.Error(err, "Timeout duration exceeded")
+        if image.Status != images.ImageStatusActive && elapsedTime > imageImportTimeout {
+            err = fmt.Errorf("timeout - wait for the image %s to transition to the ACTIVE status exceeds the timeout duration %d minutes", image.Name, r.ImageImportTimeout)
            conditions.MarkFalse(openstacknodeimagerelease,
                apiv1alpha1.OpenStackImageReadyCondition,
                apiv1alpha1.OpenStackImageImportTimeOutReason,
                clusterv1beta1.ConditionSeverityError,
                err.Error(),
            )
+            record.Warnf(openstacknodeimagerelease, "OpenStackImageImportTimeout", err.Error())
+            logger.Error(err, "timeout - import duration exceeded")
            // Image import timeout - nothing to do
            return ctrl.Result{}, nil
        }
@@ -233,7 +244,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
             err.Error(),
         )
         openstacknodeimagerelease.Status.Ready = false
-        return ctrl.Result{}, err
+        record.Warnf(openstacknodeimagerelease, "OpenStackImageStatusUnexpected", err.Error())
+        logger.Error(err, "unexpected image status")
+        return ctrl.Result{}, nil
 
     case images.ImageStatusQueued, images.ImageStatusSaving, images.ImageStatusDeleted, images.ImageStatusPendingDelete, images.ImageStatusImporting:
         // The other statuses are expected. See the explanation below:
@@ -242,7 +255,7 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
         logger.Info("OpenStackNodeImageRelease **not ready** yet - waiting for image to become ACTIVE", "name", openstacknodeimagerelease.Spec.Image.CreateOpts.Name, "ID", imageID, "status", image.Status)
         conditions.MarkFalse(openstacknodeimagerelease, apiv1alpha1.OpenStackImageReadyCondition, apiv1alpha1.OpenStackImageNotImportedYetReason, clusterv1beta1.ConditionSeverityInfo, "waiting for image to become ACTIVE")
         openstacknodeimagerelease.Status.Ready = false
-        // Wait for image
+        // Wait for image to become active
         return ctrl.Result{RequeueAfter: waitForImageBecomeActive}, nil
 
     default:
@@ -255,11 +268,12 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
             err.Error(),
         )
         openstacknodeimagerelease.Status.Ready = false
-        return ctrl.Result{}, err
+        record.Warnf(openstacknodeimagerelease, "OpenStackImageStatusUnknown", err.Error())
+        logger.Error(err, "unknown image status")
+        return ctrl.Result{}, nil
     }
 
-    // Requeue to ensure the image's presence
-    return ctrl.Result{Requeue: true, RequeueAfter: reconcileImage}, nil
+    return ctrl.Result{}, nil
 }
 
 func (r *OpenStackNodeImageReleaseReconciler) getCloudFromSecret(ctx context.Context, secretNamespace, secretName, cloudName string) (clientconfig.Cloud, error) {
