Skip to content

Commit 3896e07

Browse files
committed
OCPBUGS-61508: IngressOperator not exposing some metrics for degraded IngressController
Signed-off-by: Davide Salerno <[email protected]>
1 parent a35f1da commit 3896e07

File tree

3 files changed

+145
-2
lines changed

3 files changed

+145
-2
lines changed

pkg/operator/controller/ingress/status.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,10 @@ func (r *reconciler) syncIngressControllerStatus(ic *operatorv1.IngressControlle
104104
errs = append(errs, fmt.Errorf("failed to update ingresscontroller status: %v", err))
105105
} else {
106106
updatedIc = true
107-
SetIngressControllerConditionsMetric(updated)
108107
}
109108
}
110-
109+
// OCPBUGS-61508: set the ingress_controller_conditions metric on every reconcile.
110+
SetIngressControllerConditionsMetric(updated)
111111
return retryableerror.NewMaybeRetryableAggregate(errs), updatedIc
112112
}
113113

test/e2e/all_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,5 +136,6 @@ func TestAll(t *testing.T) {
136136
// Serializing the test ensures it runs in isolation with other tests,
137137
// preventing any impact of the mutating webhook on pod creation in the cluster
138138
t.Run("TestGatewayAPI", TestGatewayAPI)
139+
t.Run("TestIngressControllerConditionsMetricAfterRestart", TestIngressControllerConditionsMetricAfterRestart)
139140
})
140141
}
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
//go:build e2e
2+
// +build e2e
3+
4+
package e2e
5+
6+
import (
7+
"context"
8+
"fmt"
9+
"strings"
10+
"testing"
11+
"time"
12+
13+
routev1client "github.com/openshift/client-go/route/clientset/versioned"
14+
"github.com/openshift/library-go/test/library/metrics"
15+
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
16+
"github.com/prometheus/common/model"
17+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18+
"k8s.io/apimachinery/pkg/util/wait"
19+
"k8s.io/client-go/kubernetes"
20+
"sigs.k8s.io/controller-runtime/pkg/client/config"
21+
)
22+
23+
// waitForIngressControllerConditionsMetrics waits for the metrics for ingress_controller_conditions to be present.
24+
func waitForIngressControllerConditionsMetrics(t *testing.T, prometheusClient prometheusv1.API, start time.Time) error {
25+
t.Logf("Waiting for ingress_controller_conditions to be present")
26+
if err := wait.PollUntilContextTimeout(context.TODO(), 1*time.Second, 2*time.Minute, false, func(context context.Context) (bool, error) {
27+
r := prometheusv1.Range{
28+
Start: start,
29+
End: time.Now(),
30+
Step: time.Minute,
31+
}
32+
result, _, err := prometheusClient.QueryRange(context, "ingress_controller_conditions", r)
33+
if err != nil {
34+
t.Logf("Failed to fetch metrics: %v, retrying...", err)
35+
return false, nil
36+
}
37+
38+
// Check if fetched metrics is of Vector type.
39+
matrix, ok := result.(model.Matrix)
40+
if !ok {
41+
t.Logf("Unexpected metric type, retrying...")
42+
return false, nil
43+
}
44+
45+
// Check if length of returned metric Vector is zero.
46+
if len(matrix) == 0 {
47+
t.Logf("Metric is empty, retrying...")
48+
return false, nil
49+
}
50+
51+
return true, nil
52+
}); err != nil {
53+
return fmt.Errorf("Error waiting for route metrics: %w", err)
54+
}
55+
return nil
56+
}
57+
func restartOperatorPod(t *testing.T, kubeClient kubernetes.Interface) time.Time {
58+
// Find the operator pod
59+
t.Logf("Restarting Ingress operator pod...")
60+
podsList, err := kubeClient.CoreV1().Pods("openshift-ingress-operator").List(context.Background(), metav1.ListOptions{})
61+
if err != nil {
62+
t.Fatalf("Failed listing pods in openshift-ingress-operator namespace: %v", err)
63+
}
64+
operatorPodName := ""
65+
for _, pod := range podsList.Items {
66+
if strings.HasPrefix(pod.Name, "ingress-operator") {
67+
operatorPodName = pod.Name
68+
break
69+
}
70+
}
71+
if operatorPodName == "" {
72+
t.Fatalf("Unable to find ingress operator pod")
73+
}
74+
// Delete the operator pod
75+
if err := kubeClient.CoreV1().Pods("openshift-ingress-operator").Delete(context.TODO(), operatorPodName, metav1.DeleteOptions{}); err != nil {
76+
t.Fatalf("failed to find the ingresscontroller operator pods: %v", err)
77+
}
78+
79+
interval, timeout := 5*time.Second, 5*time.Minute
80+
// Wait for new pod to be ready
81+
t.Logf("Polling for up to %v to verify that the oprator restart is terminated...", timeout)
82+
if err := wait.PollUntilContextTimeout(context.Background(), interval, timeout, false, func(context context.Context) (bool, error) {
83+
if podsList, err = kubeClient.CoreV1().Pods("openshift-ingress-operator").List(context, metav1.ListOptions{}); err != nil || len(podsList.Items) == 0 {
84+
return false, err
85+
}
86+
return true, nil
87+
}); err != nil {
88+
t.Fatalf("Operator pod did not restart in time")
89+
}
90+
return time.Now()
91+
}
92+
93+
// TestIngressControllerConditionsMetricAfterRestart verifies that metric ingress_controller_conditions(router,status) is
94+
// available after an operator pod restart too.
95+
//
96+
// This test:
97+
//
98+
// 1. Verifies that the metric is available in a normal situation when the operator pod is up&running (i.e. before restart)
99+
//
100+
// 2. Restarts the operator pod, waiting it will be available again
101+
//
102+
// 3. Repeats the step 1 expecting the same result again and so the presence of the metric
103+
//
104+
// NB:
105+
// 1. this test requires an OpenShift version with the monitoring stack up&running
106+
// 2. due to the fact that this test is restarting the operator pod it cannot be executed in parallel with other tests
107+
func TestIngressControllerConditionsMetricAfterRestart(t *testing.T) {
108+
metricName := "ingress_controller_conditions"
109+
110+
// Create a new prometheus client for fetching metrics and dependencies needed
111+
kubeConfig, err := config.GetConfig()
112+
if err != nil {
113+
t.Fatalf("Failed to get kube config: %s", err)
114+
}
115+
kubeClient, err := kubernetes.NewForConfig(kubeConfig)
116+
if err != nil {
117+
t.Fatalf("Failed to create kube client: %v", err)
118+
}
119+
routeClient, err := routev1client.NewForConfig(kubeConfig)
120+
if err != nil {
121+
t.Fatalf("Failed to create route client: %v", err)
122+
}
123+
prometheusClient, err := metrics.NewPrometheusClient(context.Background(), kubeClient, routeClient)
124+
if err != nil {
125+
t.Fatalf("Failed to create prometheus client: %v", err)
126+
}
127+
128+
// Check metric before restart
129+
t.Logf("Verifying that in Prometheus metrics there are %s metrics before resart", metricName)
130+
// Wait for metrics to be added and set to 0.
131+
if err := waitForIngressControllerConditionsMetrics(t, prometheusClient, time.Now().Add(-5*time.Minute)); err != nil {
132+
t.Fatalf("Failed to fetch expected metrics: %v", err)
133+
}
134+
135+
// Restart operator pod
136+
restartTime := restartOperatorPod(t, kubeClient)
137+
138+
// Check metric after restart
139+
if err := waitForIngressControllerConditionsMetrics(t, prometheusClient, restartTime); err != nil {
140+
t.Fatalf("Failed to fetch expected metrics: %v", err)
141+
}
142+
}

0 commit comments

Comments
 (0)