Skip to content

Commit 4b231d4

Browse files
ansdZerpet
authored andcommitted
Do not exec on pods during rolling update
Fixes #304 Exec on pods only if StatefulSet is ready and up to date. Before this commit, we observed in #304 that the controller tried to exec into pods at the same time as the pods got updated due to a StatefulSet restart resulting in connection errors.
1 parent 22a628b commit 4b231d4

File tree

2 files changed

+48
-36
lines changed

2 files changed

+48
-36
lines changed

controllers/rabbitmqcluster_controller.go

Lines changed: 47 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,9 @@ func (r *RabbitmqClusterReconciler) Reconcile(req ctrl.Request) (ctrl.Result, er
198198
return ctrl.Result{}, err
199199
}
200200

201-
r.restartStatefulSetIfNeeded(ctx, builder, operationResult, rabbitmqCluster)
201+
if restarted := r.restartStatefulSetIfNeeded(ctx, builder, operationResult, rabbitmqCluster); restarted {
202+
return ctrl.Result{RequeueAfter: time.Second * 10}, nil
203+
}
202204
}
203205

204206
// Set ReconcileSuccess to true here because all CRUD operations to Kube API related
@@ -214,7 +216,7 @@ func (r *RabbitmqClusterReconciler) Reconcile(req ctrl.Request) (ctrl.Result, er
214216
return ctrl.Result{}, err
215217
}
216218

217-
if ok, err := r.allReplicasReady(ctx, rabbitmqCluster); !ok {
219+
if ok, err := r.allReplicasReadyAndUpdated(ctx, rabbitmqCluster); !ok {
218220
// only enable plugins when all pods of the StatefulSet become ready
219221
// requeue request after 10 seconds without error
220222
logger.Info("Not all replicas ready yet; requeuing request to enable plugins on RabbitmqCluster",
@@ -312,47 +314,58 @@ func (r *RabbitmqClusterReconciler) setAdminStatus(ctx context.Context, rmq *rab
312314
return nil
313315
}
314316

315-
// restartStatefulSetIfNeeded - helper function that annotates the StatefulSet PodTemplate with current timestamp
316-
// to trigger a restart of the all pods in the StatefulSet when builder requires StatefulSet to be updated
317-
func (r *RabbitmqClusterReconciler) restartStatefulSetIfNeeded(ctx context.Context, builder resource.ResourceBuilder, operationResult controllerutil.OperationResult, rmq *rabbitmqv1beta1.RabbitmqCluster) {
318-
if builder.UpdateRequiresStsRestart() && operationResult == controllerutil.OperationResultUpdated {
319-
if err := clientretry.RetryOnConflict(clientretry.DefaultRetry, func() error {
320-
sts := &appsv1.StatefulSet{ObjectMeta: metav1.ObjectMeta{Name: rmq.ChildResourceName("server"), Namespace: rmq.Namespace}}
321-
if err := r.Get(ctx, types.NamespacedName{Name: sts.Name, Namespace: sts.Namespace}, sts); err != nil {
322-
return err
323-
}
324-
if sts.Spec.Template.ObjectMeta.Annotations == nil {
325-
sts.Spec.Template.ObjectMeta.Annotations = make(map[string]string)
326-
}
327-
sts.Spec.Template.ObjectMeta.Annotations["rabbitmq.com/restartAt"] = time.Now().Format(time.RFC3339)
328-
return r.Update(ctx, sts)
329-
}); err != nil {
330-
msg := fmt.Sprintf("Failed to restart StatefulSet %s of Namespace %s; rabbitmq.conf configuration may be outdated", rmq.ChildResourceName("server"), rmq.Namespace)
331-
r.Log.Error(err, msg)
332-
r.Recorder.Event(rmq, corev1.EventTypeWarning, "FailedUpdate", msg)
317+
// Adds an arbitrary annotation (rabbitmq.com/lastRestartAt) to the StatefulSet PodTemplate to trigger a StatefulSet restart
318+
// if builder requires StatefulSet to be updated.
319+
func (r *RabbitmqClusterReconciler) restartStatefulSetIfNeeded(
320+
ctx context.Context,
321+
builder resource.ResourceBuilder,
322+
operationResult controllerutil.OperationResult,
323+
rmq *rabbitmqv1beta1.RabbitmqCluster) (restarted bool) {
324+
325+
if !(builder.UpdateRequiresStsRestart() && operationResult == controllerutil.OperationResultUpdated) {
326+
return false
327+
}
328+
329+
if err := clientretry.RetryOnConflict(clientretry.DefaultRetry, func() error {
330+
sts := &appsv1.StatefulSet{ObjectMeta: metav1.ObjectMeta{Name: rmq.ChildResourceName("server"), Namespace: rmq.Namespace}}
331+
if err := r.Get(ctx, types.NamespacedName{Name: sts.Name, Namespace: sts.Namespace}, sts); err != nil {
332+
return err
333333
}
334-
msg := fmt.Sprintf("Restarted StatefulSet %s of Namespace %s", rmq.ChildResourceName("server"), rmq.Namespace)
335-
r.Log.Info(msg)
336-
r.Recorder.Event(rmq, corev1.EventTypeNormal, "SuccessfulUpdate", msg)
334+
if sts.Spec.Template.ObjectMeta.Annotations == nil {
335+
sts.Spec.Template.ObjectMeta.Annotations = make(map[string]string)
336+
}
337+
sts.Spec.Template.ObjectMeta.Annotations["rabbitmq.com/lastRestartAt"] = time.Now().Format(time.RFC3339)
338+
return r.Update(ctx, sts)
339+
}); err != nil {
340+
msg := fmt.Sprintf("Failed to restart StatefulSet %s of Namespace %s; rabbitmq.conf configuration may be outdated", rmq.ChildResourceName("server"), rmq.Namespace)
341+
r.Log.Error(err, msg)
342+
r.Recorder.Event(rmq, corev1.EventTypeWarning, "FailedUpdate", msg)
343+
return false
337344
}
345+
346+
msg := fmt.Sprintf("Restarted StatefulSet %s of Namespace %s", rmq.ChildResourceName("server"), rmq.Namespace)
347+
r.Log.Info(msg)
348+
r.Recorder.Event(rmq, corev1.EventTypeNormal, "SuccessfulUpdate", msg)
349+
return true
338350
}
339351

340-
// allReplicasReady - helper function that checks if StatefulSet replicas are all ready
341-
func (r *RabbitmqClusterReconciler) allReplicasReady(ctx context.Context, rmq *rabbitmqv1beta1.RabbitmqCluster) (bool, error) {
352+
func (r *RabbitmqClusterReconciler) allReplicasReadyAndUpdated(ctx context.Context, rmq *rabbitmqv1beta1.RabbitmqCluster) (bool, error) {
342353
sts := &appsv1.StatefulSet{}
343354

344355
if err := r.Get(ctx, types.NamespacedName{Name: rmq.ChildResourceName("server"), Namespace: rmq.Namespace}, sts); err != nil {
345356
return false, client.IgnoreNotFound(err)
346357
}
347358

348-
if sts.Status.ReadyReplicas < *sts.Spec.Replicas {
359+
desiredReplicas := *sts.Spec.Replicas
360+
if sts.Status.ReadyReplicas < desiredReplicas ||
361+
sts.Status.UpdatedReplicas < desiredReplicas { // StatefulSet rolling update is still ongoing (see https://github.com/rabbitmq/cluster-operator/issues/304)
349362
return false, nil
350363
}
351364

352365
return true, nil
353366
}
354367

355-
// enablePlugins - helper function to set the list of enabled plugins in a given RabbitmqCluster pods
368+
// Helper function to set the list of enabled plugins in the given RabbitmqCluster pods.
356369
// `rabbitmq-plugins set` disables plugins that are not in the provided list
357370
func (r *RabbitmqClusterReconciler) enablePlugins(rmq *rabbitmqv1beta1.RabbitmqCluster) error {
358371
plugins := resource.NewRabbitmqPlugins(rmq.Spec.Rabbitmq.AdditionalPlugins)
@@ -363,11 +376,13 @@ func (r *RabbitmqClusterReconciler) enablePlugins(rmq *rabbitmqv1beta1.RabbitmqC
363376
stdout, stderr, err := r.exec(rmq.Namespace, podName, "rabbitmq", "sh", "-c", rabbitCommand)
364377

365378
if err != nil {
366-
367-
r.Log.Error(err, fmt.Sprintf(
368-
"Failed to enable plugins on pod %s in namespace %s, running command %s with output: %s %s",
369-
podName, rmq.Namespace, rabbitCommand, stdout, stderr))
370-
379+
r.Log.Error(err, "Failed to enable plugins",
380+
"namespace", rmq.Namespace,
381+
"name", rmq.Name,
382+
"pod", podName,
383+
"command", rabbitCommand,
384+
"stdout", stdout,
385+
"stderr", stderr)
371386
return err
372387
}
373388
}
@@ -407,12 +422,9 @@ func (r *RabbitmqClusterReconciler) exec(namespace, podName, containerName strin
407422
Stdin: nil,
408423
Tty: false,
409424
})
410-
411425
if err != nil {
412-
413426
return stdOut.String(), stdErr.String(), err
414427
}
415-
416428
if stdErr.Len() > 0 {
417429
return stdOut.String(), stdErr.String(), fmt.Errorf("%v", stdErr)
418430
}

internal/resource/configmap.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ func (builder *RabbitmqResourceBuilder) ServerConfigMap() *ServerConfigMapBuilde
5050
}
5151

5252
func (builder *ServerConfigMapBuilder) UpdateRequiresStsRestart() bool {
53-
return true
53+
return true // because rabbitmq.conf and advanced.config changes take effect only after a node restart
5454
}
5555

5656
func (builder *ServerConfigMapBuilder) Update(object runtime.Object) error {

0 commit comments

Comments
 (0)