Skip to content

Commit a457cd3

Browse files
authored
COH-24976 (#524)
* Fix issue where the Operator fails to patch ServiceMonitor resources, which then causes the rest of a Coherence deployment update to fail to be applied * Fix copyright headers * Failure of one or more secondary resource reconcilers should not block other reconcilers
1 parent caa48fa commit a457cd3

File tree

3 files changed

+137
-11
lines changed

3 files changed

+137
-11
lines changed

controllers/coherence_controller.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020, 2021, Oracle and/or its affiliates.
2+
* Copyright (c) 2020, 2022, Oracle and/or its affiliates.
33
* Licensed under the Universal Permissive License v 1.0 as shown at
44
* http://oss.oracle.com/licenses/upl.
55
*/
@@ -54,6 +54,12 @@ type CoherenceReconciler struct {
5454
reconcilers []reconciler.SecondaryResourceReconciler
5555
}
5656

57+
// Failure is a simple holder for a named error
58+
type Failure struct {
59+
Name string
60+
Error error
61+
}
62+
5763
// blank assignment to verify that CoherenceReconciler implements reconcile.Reconciler
5864
// There will be a compile-time error here if this breaks
5965
var _ reconcile.Reconciler = &CoherenceReconciler{}
@@ -211,15 +217,24 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
211217
}
212218

213219
// process the secondary resources in the order they should be created
220+
var failures []Failure
214221
for _, rec := range in.reconcilers {
215222
log.Info("Reconciling Coherence resource secondary resources", "controller", rec.GetControllerName())
216223
r, err := rec.ReconcileAllResourceOfKind(ctx, request, deployment, storage)
217224
if err != nil {
218-
return reconcile.Result{}, err
225+
failures = append(failures, Failure{Name: rec.GetControllerName(), Error: err})
219226
}
220227
result.Requeue = result.Requeue || r.Requeue
221228
}
222229

230+
if len(failures) > 0 {
231+
// one or more reconcilers failed:
232+
for _, failure := range failures {
233+
log.Error(failure.Error, "Secondary Reconciler failed", "Reconciler", failure.Name)
234+
}
235+
return reconcile.Result{}, fmt.Errorf("one or more secondary resource reconcilers failed to reconcile")
236+
}
237+
223238
// if replica count is zero update the status to Stopped
224239
if deployment.GetReplicas() == 0 {
225240
if err = in.UpdateDeploymentStatusPhase(ctx, request.NamespacedName, coh.ConditionTypeStopped); err != nil {

controllers/servicemonitor/servicemonitor_controller.go

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020, 2021, Oracle and/or its affiliates.
2+
* Copyright (c) 2020, 2022, Oracle and/or its affiliates.
33
* Licensed under the Universal Permissive License v 1.0 as shown at
44
* http://oss.oracle.com/licenses/upl.
55
*/
@@ -214,11 +214,46 @@ func (in *ReconcileServiceMonitor) UpdateServiceMonitor(ctx context.Context, nam
214214

215215
logger.Info("Patching ServiceMonitor")
216216
_, err = in.monClient.ServiceMonitors(namespace).Patch(ctx, name, in.GetPatchType(), data, metav1.PatchOptions{})
217-
if hashMatches {
218-
logger.Info("Patch applied to ServiceMonitor even though hashes matched (possible external update)")
219-
}
220217
if err != nil {
221-
return errors.Wrapf(err, "cannot patch ServiceMonitor %s/%s", namespace, name)
218+
// Patch or update failed - resort to an update with retry as sometimes custom resource (like ServiceMonitor) cannot be patched
219+
count := 1
220+
reason := "patch"
221+
for err != nil && count <= 5 {
222+
logger.Info(fmt.Sprintf("Failed to %s ServiceMonitor - retrying update", reason),
223+
"Attempt", count, "Error", err.Error())
224+
count++
225+
// re-fetch the current spec
226+
current, err = in.monClient.ServiceMonitors(namespace).Get(ctx, current.Name, metav1.GetOptions{})
227+
switch {
228+
case err != nil && apierrors.IsNotFound(err):
229+
// not found error so try creating the ServiceMonitor (shouldn't really get here!)
230+
reason = "create"
231+
_, err = in.monClient.ServiceMonitors(namespace).Create(ctx, desired.Spec.(*monitoring.ServiceMonitor), metav1.CreateOptions{})
232+
case err != nil:
233+
// Error reading the object - requeue the request.
234+
// We can't call the error handler as we do not even have an owning Coherence resource.
235+
// We log the error and do not requeue the request.
236+
logger.Info("Failed to re-fetch ServiceMonitor")
237+
default:
238+
// update the current spec
239+
reason = "update"
240+
current.Spec = desired.Spec.(*monitoring.ServiceMonitor).Spec
241+
_, err = in.monClient.ServiceMonitors(namespace).Update(ctx, current, metav1.UpdateOptions{})
242+
}
243+
}
244+
245+
if err != nil {
246+
logger.Info(fmt.Sprintf("Failed to %s ServiceMonitor %s - Gave up after %d attempts.", reason, name, count),
247+
"Error", err.Error())
248+
}
249+
}
250+
251+
if err == nil {
252+
if hashMatches {
253+
logger.Info("Update applied to ServiceMonitor even though hashes matched (possible external update)")
254+
} else {
255+
logger.Info("Update applied to ServiceMonitor")
256+
}
222257
}
223258

224259
return nil

test/e2e/prometheus/prometheus_test.go

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020, 2021, Oracle and/or its affiliates.
2+
* Copyright (c) 2020, 2022, Oracle and/or its affiliates.
33
* Licensed under the Universal Permissive License v 1.0 as shown at
44
* http://oss.oracle.com/licenses/upl.
55
*/
@@ -9,10 +9,16 @@ package prometheus
99
import (
1010
"encoding/json"
1111
"fmt"
12+
monitoring "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
13+
client "github.com/coreos/prometheus-operator/pkg/client/versioned/typed/monitoring/v1"
1214
. "github.com/onsi/gomega"
15+
coh "github.com/oracle/coherence-operator/api/v1"
1316
"github.com/oracle/coherence-operator/test/e2e/helper"
1417
"io/ioutil"
1518
corev1 "k8s.io/api/core/v1"
19+
apierrors "k8s.io/apimachinery/pkg/api/errors"
20+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
"k8s.io/apimachinery/pkg/types"
1622
"k8s.io/apimachinery/pkg/util/wait"
1723
"net/http"
1824
"strings"
@@ -29,20 +35,32 @@ func TestPrometheus(t *testing.T) {
2935
g.Expect(err).NotTo(HaveOccurred())
3036
g.Expect(ok).To(BeTrue(), "Cannot find any Prometheus Pods - this test requires Prometheus to have been installed")
3137

32-
AssertPrometheus(t, "prometheus-test.yaml", promPod)
38+
promClient, err := client.NewForConfig(testContext.Config)
39+
g.Expect(err).NotTo(HaveOccurred())
40+
41+
AssertPrometheus(t, "prometheus-test.yaml", promPod, promClient)
3342
}
3443

35-
func AssertPrometheus(t *testing.T, yamlFile string, promPod corev1.Pod) {
44+
func AssertPrometheus(t *testing.T, yamlFile string, promPod corev1.Pod, promClient *client.MonitoringV1Client) {
45+
g := NewGomegaWithT(t)
46+
3647
ShouldGetPrometheusConfig(t, promPod)
3748

3849
// Deploy the Coherence cluster
39-
_, cohPods := helper.AssertDeployments(testContext, t, yamlFile)
50+
deployments, cohPods := helper.AssertDeployments(testContext, t, yamlFile)
51+
deployment := deployments["test"]
52+
53+
err := ShouldEventuallyHaveServiceMonitor(t, deployment.Namespace, "test-metrics", promClient, 10*time.Second, 5*time.Minute)
54+
g.Expect(err).NotTo(HaveOccurred())
4055

4156
// Wait for Coherence metrics to appear in Prometheus
4257
ShouldEventuallySeeClusterMetrics(t, promPod, cohPods)
4358

4459
// Ensure that we can see the deployments size metric
4560
ShouldGetClusterSizeMetric(t, promPod)
61+
62+
// Ensure we can update the Coherence deployment and cause the ServiceMonitor to be updated
63+
ShouldPatchServiceMonitor(t, deployment, promClient)
4664
}
4765

4866
func IsPrometheusInstalled() (bool, corev1.Pod, error) {
@@ -107,6 +125,64 @@ func ShouldGetClusterSizeMetric(t *testing.T, pod corev1.Pod) {
107125
g.Expect(err).NotTo(HaveOccurred())
108126
}
109127

128+
func ShouldPatchServiceMonitor(t *testing.T, deployment coh.Coherence, promClient *client.MonitoringV1Client) {
129+
g := NewGomegaWithT(t)
130+
131+
current := &coh.Coherence{}
132+
err := testContext.Client.Get(testContext.Context, types.NamespacedName{Namespace: deployment.Namespace, Name: deployment.Name}, current)
133+
g.Expect(err).NotTo(HaveOccurred())
134+
135+
// update the ServiceMonitor interval to cause an update
136+
current.Spec.Ports[0].ServiceMonitor.Interval = "10s"
137+
err = testContext.Client.Update(testContext.Context, current)
138+
g.Expect(err).NotTo(HaveOccurred())
139+
140+
err = ShouldEventuallyHaveServiceMonitorWithState(t, deployment.Namespace, "test-metrics", hasInterval, promClient, 10*time.Second, 5*time.Minute)
141+
g.Expect(err).NotTo(HaveOccurred())
142+
143+
}
144+
145+
func ShouldEventuallyHaveServiceMonitor(t *testing.T, namespace, name string, promClient *client.MonitoringV1Client, retryInterval, timeout time.Duration) error {
146+
return ShouldEventuallyHaveServiceMonitorWithState(t, namespace, name, alwaysTrue, promClient, retryInterval, timeout)
147+
}
148+
149+
type ServiceMonitorPredicate func(*testing.T, *monitoring.ServiceMonitor) bool
150+
151+
func alwaysTrue(*testing.T, *monitoring.ServiceMonitor) bool {
152+
return true
153+
}
154+
155+
func hasInterval(t *testing.T, sm *monitoring.ServiceMonitor) bool {
156+
if len(sm.Spec.Endpoints) > 0 && sm.Spec.Endpoints[0].Interval == "10s" {
157+
return true
158+
}
159+
t.Logf("Waiting for availability of ServiceMonitor resource %s - with endpoint interval of 10s", sm.Name)
160+
return false
161+
}
162+
163+
func ShouldEventuallyHaveServiceMonitorWithState(t *testing.T, namespace, name string, predicate ServiceMonitorPredicate, promClient *client.MonitoringV1Client, retryInterval, timeout time.Duration) error {
164+
var sm *monitoring.ServiceMonitor
165+
166+
err := wait.Poll(retryInterval, timeout, func() (done bool, err error) {
167+
sm, err = promClient.ServiceMonitors(namespace).Get(testContext.Context, name, v1.GetOptions{})
168+
if err != nil {
169+
if apierrors.IsNotFound(err) {
170+
t.Logf("Waiting for availability of ServiceMonitor resource %s - NotFound", name)
171+
return false, nil
172+
}
173+
t.Logf("Waiting for availability of ServiceMonitor resource %s - %s", name, err.Error())
174+
return false, nil
175+
}
176+
if predicate(t, sm) {
177+
return true, nil
178+
}
179+
t.Logf("Waiting for availability of ServiceMonitor resource %s - %s to match predicate", name, err.Error())
180+
return false, nil
181+
})
182+
183+
return err
184+
}
185+
110186
func PrometheusQuery(t *testing.T, pod corev1.Pod, query string, result interface{}) error {
111187
r, err := PrometheusAPIRequest(pod, "/api/v1/query?query="+query)
112188
if err != nil {

0 commit comments

Comments
 (0)