From 79c11fe326ec916f7c3513c51f4fad2020adcf3f Mon Sep 17 00:00:00 2001
From: Davide Salerno
Date: Fri, 3 Oct 2025 12:41:22 +0200
Subject: [PATCH] OCPBUGS-61508: IngressOperator not exposing some metrics for degraded IngressController

Signed-off-by: Davide Salerno
---
 pkg/operator/controller/ingress/status.go |   4 +-
 test/e2e/all_test.go                       |   1 +
 test/e2e/ic_conditions_metric_test.go      | 185 ++++++++++++++++++++++
 3 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 test/e2e/ic_conditions_metric_test.go

diff --git a/pkg/operator/controller/ingress/status.go b/pkg/operator/controller/ingress/status.go
index e0df9564d5..ee667355d7 100644
--- a/pkg/operator/controller/ingress/status.go
+++ b/pkg/operator/controller/ingress/status.go
@@ -106,10 +106,10 @@ func (r *reconciler) syncIngressControllerStatus(ic *operatorv1.IngressControlle
 			errs = append(errs, fmt.Errorf("failed to update ingresscontroller status: %v", err))
 		} else {
 			updatedIc = true
-			SetIngressControllerConditionsMetric(updated)
 		}
 	}
-
+	// OCPBUGS-61508: set the ingress_controller_conditions metrics on every reconcile.
+	SetIngressControllerConditionsMetric(updated)
 	return retryableerror.NewMaybeRetryableAggregate(errs), updatedIc
 }
 
diff --git a/test/e2e/all_test.go b/test/e2e/all_test.go
index 2d499afd3b..5b0f29af42 100644
--- a/test/e2e/all_test.go
+++ b/test/e2e/all_test.go
@@ -136,5 +136,6 @@ func TestAll(t *testing.T) {
 		// Serializing the test ensures it runs in isolation with other tests,
 		// preventing any impact of the mutating webhook on pod creation in the cluster
 		t.Run("TestGatewayAPI", TestGatewayAPI)
+		t.Run("TestIngressControllerConditionsMetricAfterRestart", TestIngressControllerConditionsMetricAfterRestart)
 	})
 }
diff --git a/test/e2e/ic_conditions_metric_test.go b/test/e2e/ic_conditions_metric_test.go
new file mode 100644
index 0000000000..7a777292df
--- /dev/null
+++ b/test/e2e/ic_conditions_metric_test.go
@@ -0,0 +1,185 @@
+//go:build e2e
+// +build e2e
+
+package e2e
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	routev1client "github.com/openshift/client-go/route/clientset/versioned"
+	"github.com/openshift/library-go/test/library/metrics"
+	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	"github.com/prometheus/common/model"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/kubernetes"
+	"sigs.k8s.io/controller-runtime/pkg/client/config"
+)
+
+// waitForIngressControllerConditionsMetrics waits for the ingress_controller_conditions metrics to be present.
+func waitForIngressControllerConditionsMetrics(t *testing.T, prometheusClient prometheusv1.API) error {
+	t.Logf("Waiting for ingress_controller_conditions to be present")
+	if err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 2*time.Minute, false, func(context context.Context) (bool, error) {
+		result, _, err := prometheusClient.Query(context, "ingress_controller_conditions", time.Now())
+		if err != nil {
+			t.Logf("Failed to fetch metrics: %v, retrying...", err)
+			return false, nil
+		}
+
+		// Check that the fetched metric is of Vector type.
+		vector, ok := result.(model.Vector)
+		if !ok {
+			t.Logf("Unexpected metric type, retrying...")
+			return false, nil
+		}
+
+		// Check whether the returned metric Vector is empty.
+		if len(vector) == 0 {
+			t.Logf("Metric is empty, retrying...")
+			return false, nil
+		}
+
+		return true, nil
+	}); err != nil {
+		return fmt.Errorf("error waiting for ingress controller metrics: %w", err)
+	}
+	return nil
+}
+
+// restartOperatorPod restarts the operator pod and returns an error if the restart fails.
+func restartOperatorPod(t *testing.T, kubeClient kubernetes.Interface) error {
+	interval, timeout := 5*time.Second, 5*time.Minute
+	var podsList *corev1.PodList
+
+	// Find the operator pod.
+	t.Logf("Restarting Ingress operator pod...")
+	if err := wait.PollUntilContextTimeout(context.Background(), interval, timeout, false, func(context context.Context) (bool, error) {
+		innerPodsList, err := kubeClient.CoreV1().Pods("openshift-ingress-operator").List(context, metav1.ListOptions{})
+		podsList = innerPodsList
+		if err != nil {
+			t.Logf("Failed to list pods: %v, retrying...", err)
+			return false, nil
+		}
+		return innerPodsList != nil && len(innerPodsList.Items) > 0, nil
+	}); err != nil {
+		return err
+	}
+	operatorPodName, operatorPodUID := extractIngressOperatorPodNameAndUID(podsList)
+
+	if operatorPodName == "" || operatorPodUID == "" {
+		return errors.New("unable to find ingress operator pod")
+	}
+
+	// Delete the operator pod.
+	if err := wait.PollUntilContextTimeout(context.Background(), interval, timeout, false, func(context context.Context) (bool, error) {
+		if err := kubeClient.CoreV1().Pods("openshift-ingress-operator").Delete(context, operatorPodName, metav1.DeleteOptions{}); err != nil {
+			t.Logf("Failed to delete operator pod: %v, retrying...", err)
+			return false, nil
+		}
+		return true, nil
+	}); err != nil {
+		return err
+	}
+
+	// Wait for the new pod to be ready.
+	t.Logf("Polling for up to %v to verify that the operator restart has completed...", timeout)
+	if err := wait.PollUntilContextTimeout(context.Background(), interval, timeout, false, func(context context.Context) (bool, error) {
+		podsList, err := kubeClient.CoreV1().Pods("openshift-ingress-operator").List(context, metav1.ListOptions{})
+		if err != nil {
+			t.Logf("Failed to list pods: %v, retrying...", err)
+			return false, nil
+		}
+
+		if podsList == nil || len(podsList.Items) == 0 {
+			t.Logf("No pods found, retrying...")
+			return false, nil
+		}
+		// Check that the new pod is different from the previous one.
+		_, newOperatorPodUID := extractIngressOperatorPodNameAndUID(podsList)
+		if newOperatorPodUID == "" {
+			t.Logf("Failed to find ingress operator pod, retrying...")
+			return false, nil
+		}
+		if newOperatorPodUID == operatorPodUID {
+			t.Logf("Failed to find new ingress operator pod, retrying...")
+			return false, nil
+		}
+		return true, nil
+	}); err != nil {
+		return err
+	}
+	return nil
+}
+
+// extractIngressOperatorPodNameAndUID is a helper that extracts the name and UID of the ingress operator pod from a pod list.
+func extractIngressOperatorPodNameAndUID(podsList *corev1.PodList) (string, types.UID) {
+	operatorPodName := ""
+	operatorPodUID := types.UID("")
+	for _, pod := range podsList.Items {
+		if strings.HasPrefix(pod.Name, "ingress-operator") {
+			operatorPodName = pod.Name
+			operatorPodUID = pod.UID
+			break
+		}
+	}
+	return operatorPodName, operatorPodUID
+}
+
+// TestIngressControllerConditionsMetricAfterRestart verifies that the ingress_controller_conditions(router,status)
+// metric is still available after an operator pod restart.
+//
+// This test:
+//
+// 1. Verifies that the metric is available in the normal situation, i.e. while the operator pod is up and running (before the restart).
+//
+// 2. Restarts the operator pod and waits until it is available again.
+//
+// 3. Repeats step 1, expecting the same result, i.e. that the metric is still present.
+//
+// NB:
+// 1. This test requires an OpenShift cluster with the monitoring stack up and running.
+// 2. Because this test restarts the operator pod, it cannot be executed in parallel with other tests.
+func TestIngressControllerConditionsMetricAfterRestart(t *testing.T) {
+
+	// Create a Prometheus client for fetching metrics, along with the clients it depends on.
+	kubeConfig, err := config.GetConfig()
+	if err != nil {
+		t.Fatalf("Failed to get kube config: %s", err)
+	}
+	kubeClient, err := kubernetes.NewForConfig(kubeConfig)
+	if err != nil {
+		t.Fatalf("Failed to create kube client: %v", err)
+	}
+	routeClient, err := routev1client.NewForConfig(kubeConfig)
+	if err != nil {
+		t.Fatalf("Failed to create route client: %v", err)
+	}
+	prometheusClient, err := metrics.NewPrometheusClient(context.Background(), kubeClient, routeClient)
+	if err != nil {
+		t.Fatalf("Failed to create prometheus client: %v", err)
+	}
+
+	// Check the metric before the restart.
+	t.Log("Verifying that the ingress_controller_conditions metrics are present in Prometheus before the restart")
+	// Wait for the metrics to be reported.
+	if err := waitForIngressControllerConditionsMetrics(t, prometheusClient); err != nil {
+		t.Fatalf("Failed to fetch expected metrics: %v", err)
+	}
+
+	// Restart the operator pod.
+	if err := restartOperatorPod(t, kubeClient); err != nil {
+		t.Fatalf("Failed to restart operator pod: %v", err)
+	}
+
+	// Check the metric after the restart.
+	if err := waitForIngressControllerConditionsMetrics(t, prometheusClient); err != nil {
+		t.Fatalf("Failed to fetch expected metrics: %v", err)
+	}
+}
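
For context, SetIngressControllerConditionsMetric publishes the ingress_controller_conditions gauge from the IngressController's status conditions, and an in-memory gauge starts out empty after an operator pod restart; calling the setter on every reconcile, as this patch does, is what repopulates the series even when the status itself does not change. Below is a minimal sketch of how such a gauge is typically maintained with client_golang; the package name, label set, and condition handling are illustrative assumptions, not the operator's actual implementation.

// Illustrative sketch only; not part of the patch above.
package metrics

import (
	operatorv1 "github.com/openshift/api/operator/v1"
	"github.com/prometheus/client_golang/prometheus"
)

// ingressControllerConditions is an assumed GaugeVec keyed by ingresscontroller
// name and condition type; the operator's real label set may differ.
var ingressControllerConditions = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Name: "ingress_controller_conditions",
	Help: "Reports ingresscontroller conditions (1 = True, 0 = False).",
}, []string{"name", "condition"})

func init() {
	// Register the gauge with the default registry so it is scraped.
	prometheus.MustRegister(ingressControllerConditions)
}

// SetIngressControllerConditionsMetric re-publishes the gauge from the current
// status conditions. Because the GaugeVec lives in the operator pod's memory,
// it must be called on every reconcile for the series to reappear after a restart.
func SetIngressControllerConditionsMetric(ic *operatorv1.IngressController) {
	for _, cond := range ic.Status.Conditions {
		value := 0.0
		if cond.Status == operatorv1.ConditionTrue {
			value = 1.0
		}
		ingressControllerConditions.WithLabelValues(ic.Name, cond.Type).Set(value)
	}
}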