Skip to content

Commit 8773f8f

Browse files
Merge pull request #1290 from davidesalerno/ocpbugs61508onmaster
OCPBUGS-61508: IngressOperator not exposing some metrics for degraded…
2 parents 072c1cd + 00810f9 commit 8773f8f

File tree

3 files changed

+188
-2
lines changed

3 files changed

+188
-2
lines changed

pkg/operator/controller/ingress/status.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,10 @@ func (r *reconciler) syncIngressControllerStatus(ic *operatorv1.IngressControlle
104104
errs = append(errs, fmt.Errorf("failed to update ingresscontroller status: %v", err))
105105
} else {
106106
updatedIc = true
107-
SetIngressControllerConditionsMetric(updated)
108107
}
109108
}
110-
109+
//OCPBUGS-61508 set at every reconcile the ingress_controller_conditions metrics
110+
SetIngressControllerConditionsMetric(updated)
111111
return retryableerror.NewMaybeRetryableAggregate(errs), updatedIc
112112
}
113113

test/e2e/all_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,5 +136,6 @@ func TestAll(t *testing.T) {
136136
// Serializing the test ensures it runs in isolation with other tests,
137137
// preventing any impact of the mutating webhook on pod creation in the cluster
138138
t.Run("TestGatewayAPI", TestGatewayAPI)
139+
t.Run("TestIngressControllerConditionsMetricAfterRestart", TestIngressControllerConditionsMetricAfterRestart)
139140
})
140141
}
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
//go:build e2e
2+
// +build e2e
3+
4+
package e2e
5+
6+
import (
7+
"context"
8+
"errors"
9+
"fmt"
10+
"strings"
11+
"testing"
12+
"time"
13+
14+
routev1client "github.com/openshift/client-go/route/clientset/versioned"
15+
"github.com/openshift/library-go/test/library/metrics"
16+
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
17+
"github.com/prometheus/common/model"
18+
corev1 "k8s.io/api/core/v1"
19+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
20+
"k8s.io/apimachinery/pkg/types"
21+
"k8s.io/apimachinery/pkg/util/wait"
22+
"k8s.io/client-go/kubernetes"
23+
"sigs.k8s.io/controller-runtime/pkg/client/config"
24+
)
25+
26+
// waitForIngressControllerConditionsMetrics waits for the metrics for ingress_controller_conditions to be present.
27+
func waitForIngressControllerConditionsMetrics(t *testing.T, prometheusClient prometheusv1.API) error {
28+
t.Logf("Waiting for ingress_controller_conditions to be present")
29+
if err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 2*time.Minute, false, func(context context.Context) (bool, error) {
30+
result, _, err := prometheusClient.Query(context, "ingress_controller_conditions", time.Now())
31+
if err != nil {
32+
t.Logf("Failed to fetch metrics: %v, retrying...", err)
33+
return false, nil
34+
}
35+
36+
// Check if fetched metrics is of Vector type.
37+
vector, ok := result.(model.Vector)
38+
if !ok {
39+
t.Logf("Unexpected metric type, retrying...")
40+
return false, nil
41+
}
42+
43+
// Check if length of returned metric Vector is zero.
44+
if len(vector) == 0 {
45+
t.Logf("Metric is empty, retrying...")
46+
return false, nil
47+
}
48+
49+
return true, nil
50+
}); err != nil {
51+
return fmt.Errorf("Error waiting for ingress controller metrics: %w", err)
52+
}
53+
return nil
54+
}
55+
56+
// restartOperatorPod will restart the operator pod, and if some error occurs it returns it
57+
func restartOperatorPod(t *testing.T, kubeClient kubernetes.Interface) error {
58+
interval, timeout := 5*time.Second, 5*time.Minute
59+
var podsList *corev1.PodList
60+
61+
// Find the operator pod
62+
t.Logf("Restarting Ingress operator pod...")
63+
if err := wait.PollUntilContextTimeout(context.Background(), interval, timeout, false, func(context context.Context) (bool, error) {
64+
innerPodsList, err := kubeClient.CoreV1().Pods("openshift-ingress-operator").List(context, metav1.ListOptions{})
65+
podsList = innerPodsList
66+
if err != nil {
67+
t.Logf("Failed to list pods: %v, retying...", err)
68+
return false, nil
69+
}
70+
return innerPodsList != nil && len(innerPodsList.Items) > 0, nil
71+
}); err != nil {
72+
return err
73+
}
74+
operatorPodName, operatorPodUID := extractIngressOperatorPodNameAndUID(podsList)
75+
76+
if operatorPodName == "" || operatorPodUID == "" {
77+
return errors.New(("Unable to find ingress operator pod"))
78+
}
79+
80+
// Delete the operator pod
81+
if err := wait.PollUntilContextTimeout(context.Background(), interval, timeout, false, func(context context.Context) (bool, error) {
82+
if err := kubeClient.CoreV1().Pods("openshift-ingress-operator").Delete(context, operatorPodName, metav1.DeleteOptions{}); err != nil {
83+
t.Logf("Failed to delete operator pod: %v, retying...", err)
84+
return false, nil
85+
}
86+
return true, nil
87+
}); err != nil {
88+
return err
89+
}
90+
91+
// Wait for new pod to be ready
92+
t.Logf("Polling for up to %v to verify that the operator restart is terminated...", timeout)
93+
if err := wait.PollUntilContextTimeout(context.Background(), interval, timeout, false, func(context context.Context) (bool, error) {
94+
podsList, err := kubeClient.CoreV1().Pods("openshift-ingress-operator").List(context, metav1.ListOptions{})
95+
if err != nil {
96+
t.Logf("Failed to list pods: %v, retying...", err)
97+
return false, nil
98+
}
99+
100+
if podsList == nil || len(podsList.Items) == 0 {
101+
t.Logf("No pods found, retying...")
102+
return false, nil
103+
}
104+
//We are checking that the new pod is different from the previous one
105+
_, newOperatorPodUID := extractIngressOperatorPodNameAndUID(podsList)
106+
if newOperatorPodUID == "" {
107+
t.Logf("Failed to find ingress operator pod, retrying...")
108+
return false, nil
109+
}
110+
if newOperatorPodUID == operatorPodUID {
111+
t.Logf("Failed to find new ingress operator pod, retrying...")
112+
return false, nil
113+
}
114+
return true, nil
115+
}); err != nil {
116+
return err
117+
}
118+
return nil
119+
}
120+
121+
// extractIngressOperatorPodNameAndUID helper function to extract from a pods list the name and UID of the ingress operator pod
122+
func extractIngressOperatorPodNameAndUID(podsList *corev1.PodList) (string, types.UID) {
123+
operatorPodName := ""
124+
operatorPodUID := types.UID("")
125+
for _, pod := range podsList.Items {
126+
if strings.HasPrefix(pod.Name, "ingress-operator") {
127+
operatorPodName = pod.Name
128+
operatorPodUID = pod.UID
129+
break
130+
}
131+
}
132+
return operatorPodName, operatorPodUID
133+
}
134+
135+
// TestIngressControllerConditionsMetricAfterRestart verifies that metric ingress_controller_conditions(router,status) is
136+
// available after an operator pod restart too.
137+
//
138+
// This test:
139+
//
140+
// 1. Verifies that the metric is available in a normal situation when the operator pod is up&running (i.e. before restart)
141+
//
142+
// 2. Restarts the operator pod, waiting it will be available again
143+
//
144+
// 3. Repeats the step 1 expecting the same result again and so the presence of the metric
145+
//
146+
// NB:
147+
// 1. this test requires an OpenShift version with the monitoring stack up&running
148+
// 2. due to the fact that this test is restarting the operator pod it cannot be executed in parallel with other tests
149+
func TestIngressControllerConditionsMetricAfterRestart(t *testing.T) {
150+
151+
// Create a new prometheus client for fetching metrics and dependencies needed
152+
kubeConfig, err := config.GetConfig()
153+
if err != nil {
154+
t.Fatalf("Failed to get kube config: %s", err)
155+
}
156+
kubeClient, err := kubernetes.NewForConfig(kubeConfig)
157+
if err != nil {
158+
t.Fatalf("Failed to create kube client: %v", err)
159+
}
160+
routeClient, err := routev1client.NewForConfig(kubeConfig)
161+
if err != nil {
162+
t.Fatalf("Failed to create route client: %v", err)
163+
}
164+
prometheusClient, err := metrics.NewPrometheusClient(context.Background(), kubeClient, routeClient)
165+
if err != nil {
166+
t.Fatalf("Failed to create prometheus client: %v", err)
167+
}
168+
169+
// Check metric before restart
170+
t.Log("Verifying that in Prometheus metrics there are ingress_controller_conditions metrics before restart")
171+
// Wait for metrics to be added and set to 0.
172+
if err := waitForIngressControllerConditionsMetrics(t, prometheusClient); err != nil {
173+
t.Fatalf("Failed to fetch expected metrics: %v", err)
174+
}
175+
176+
// Restart operator pod
177+
if err := restartOperatorPod(t, kubeClient); err != nil {
178+
t.Fatalf("Failed to restart operator pod: %v", err)
179+
}
180+
181+
// Check metric after restart
182+
if err := waitForIngressControllerConditionsMetrics(t, prometheusClient); err != nil {
183+
t.Fatalf("Failed to fetch expected metrics: %v", err)
184+
}
185+
}

0 commit comments

Comments
 (0)