diff --git a/Makefile b/Makefile index 7165884f9..e49482c03 100644 --- a/Makefile +++ b/Makefile @@ -1552,10 +1552,6 @@ e2e-helm-test: $(BUILD_PROPS) $(BUILD_HELM)/coherence-operator-$(VERSION).tgz un # ---------------------------------------------------------------------------------------------------------------------- # Executes the Go end-to-end tests that require Prometheus in the k8s cluster -# using a LOCAL operator instance (i.e. the operator is not deployed to k8s). -# -# This target DOES NOT install Prometheus, use the e2e-prometheus-test target -# to fully reset the test namespace. # # These tests will use whichever k8s cluster the local environment # is pointing to. @@ -1563,6 +1559,7 @@ e2e-helm-test: $(BUILD_PROPS) $(BUILD_HELM)/coherence-operator-$(VERSION).tgz un .PHONY: e2e-prometheus-test e2e-prometheus-test: export MF = $(MAKEFLAGS) e2e-prometheus-test: reset-namespace install-prometheus create-ssl-secrets ensure-pull-secret deploy-and-wait ## Run the Operator metrics/Prometheus end-to-end functional tests + sleep 10 $(MAKE) run-prometheus-test $${MF} \ ; rc=$$? \ ; $(MAKE) uninstall-prometheus $${MF} \ diff --git a/config/components/restricted/cluster_role.yaml b/config/components/restricted/cluster_role.yaml new file mode 100644 index 000000000..db920731b --- /dev/null +++ b/config/components/restricted/cluster_role.yaml @@ -0,0 +1,5 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: crd-webhook-install-role +$patch: delete diff --git a/config/components/restricted/cluster_role_binding.yaml b/config/components/restricted/cluster_role_binding.yaml new file mode 100644 index 000000000..081d34787 --- /dev/null +++ b/config/components/restricted/cluster_role_binding.yaml @@ -0,0 +1,9 @@ +# -------------------------------------------------------------------- +# This is the Cluster Role binding required by the Coherence Operator +# to self-manage its CRDs and Web-Hooks. +# -------------------------------------------------------------------- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: crd-webhook-install-rolebinding +$patch: delete diff --git a/config/components/restricted/kustomization.yaml b/config/components/restricted/kustomization.yaml index 64356f22e..999a0ece4 100644 --- a/config/components/restricted/kustomization.yaml +++ b/config/components/restricted/kustomization.yaml @@ -8,3 +8,5 @@ patches: name: controller-manager - path: node-viewer-role.yaml - path: node_viewer_role_binding.yaml + - path: cluster_role.yaml + - path: cluster_role_binding.yaml diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml deleted file mode 100644 index 2aaef6536..000000000 --- a/config/default/manager_metrics_patch.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# This patch adds the args to allow exposing the metrics endpoint using HTTPS -- op: add - path: /spec/template/spec/containers/0/args/0 - value: --metrics-bind-address=:8443 diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml index 5bc256c0b..5a1be95e2 100644 --- a/config/default/metrics_service.yaml +++ b/config/default/metrics_service.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: Service metadata: name: metrics-service - namespace: system + namespace: default spec: ports: - name: https diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml index 98d47047a..2f2c5a82d 100644 --- a/config/network-policy/allow-metrics-traffic.yaml +++ b/config/network-policy/allow-metrics-traffic.yaml @@ -11,7 +11,7 @@ metadata: app.kubernetes.io/version: "3.5.6" app.kubernetes.io/part-of: coherence-operator name: allow-metrics-traffic - namespace: system + namespace: default spec: podSelector: matchLabels: diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml index 8c10777cb..3ad56b326 100644 --- a/config/prometheus/monitor.yaml +++ b/config/prometheus/monitor.yaml @@ -9,7 +9,7 @@ metadata: app.kubernetes.io/version: "3.5.6" app.kubernetes.io/part-of: coherence-operator name: controller-manager-metrics-monitor - namespace: system + namespace: default spec: endpoints: - path: /metrics diff --git a/config/rbac/cluster_role.yaml b/config/rbac/cluster_role.yaml index 4178e28d1..ecdd20038 100644 --- a/config/rbac/cluster_role.yaml +++ b/config/rbac/cluster_role.yaml @@ -1,6 +1,14 @@ # ------------------------------------------------------------- # This is the Cluster Roles required by the Coherence Operator # to self-manage its CRDs and Web-Hooks. +# +# The Operator no longer installs its own CRDs so this role +# is now only used by the operator to ensure any existing +# webhook config gets removed when the operator starts. +# +# If the operator has never been installed with web hooks +# enabled, or the web hook config has been manually removed, +# then this role is not required. # ------------------------------------------------------------- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 76efc18e4..b737e6799 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -9,6 +9,8 @@ resources: - role_binding.yaml - node_viewer_role.yaml - node_viewer_role_binding.yaml + - cluster_role.yaml + - cluster_role_binding.yaml - leader_election_role.yaml - leader_election_role_binding.yaml # The following RBAC configurations are used to protect diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml index 33f2b0dbf..201e87eaf 100644 --- a/config/rbac/metrics_auth_role_binding.yaml +++ b/config/rbac/metrics_auth_role_binding.yaml @@ -15,4 +15,4 @@ roleRef: subjects: - kind: ServiceAccount name: controller-manager - namespace: system + namespace: default diff --git a/hack/prometheus/prometheus-rbac.yaml b/hack/prometheus/prometheus-rbac.yaml index 6eb629cd0..af2587d96 100644 --- a/hack/prometheus/prometheus-rbac.yaml +++ b/hack/prometheus/prometheus-rbac.yaml @@ -17,6 +17,11 @@ rules: - endpoints - pods verbs: ["get", "list", "watch"] +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: ["get", "list", "watch"] - apiGroups: [""] resources: - configmaps diff --git a/pkg/runner/cmd_operator.go b/pkg/runner/cmd_operator.go index d0d349665..891c4ca64 100644 --- a/pkg/runner/cmd_operator.go +++ b/pkg/runner/cmd_operator.go @@ -93,8 +93,6 @@ func execute(v *viper.Viper) error { c.NextProtos = []string{"http/1.1"} } - vpr := operator.GetViper() - var tlsOpts []func(*tls.Config) suiteConfig, err := operator.NewCipherSuiteConfig(v, setupLog) if err != nil { @@ -102,11 +100,12 @@ func execute(v *viper.Viper) error { } tlsOpts = append(tlsOpts, suiteConfig) - enableHTTP2 := vpr.GetBool(operator.FlagEnableHttp2) + enableHTTP2 := v.GetBool(operator.FlagEnableHttp2) if !enableHTTP2 { tlsOpts = append(tlsOpts, disableHTTP2) } + setupLog.Info("Obtaining kubernetes client config") cfg := ctrl.GetConfigOrDie() cfg.WrapTransport = func(rt http.RoundTripper) http.RoundTripper { t := rt.(*http.Transport) @@ -114,24 +113,44 @@ func execute(v *viper.Viper) error { return rt } + setupLog.Info("Creating kubernetes client") cs, err := clients.NewForConfig(cfg) if err != nil { return errors.Wrap(err, "unable to create client set") } - // The Operator web-hook server has been removed so we need to delete any existing web-hooks + version, err := cs.DiscoveryClient.ServerVersion() + if err != nil { + return errors.Wrap(err, "unable to get kubernetes server version") + } + setupLog.Info("Kubernetes server version", "Major", version.Major, "Minor", version.Minor, "Platform", version.Platform) + + // The Operator web-hook server has been removed, so we need to delete any existing web-hooks + setupLog.Info("Ensuring any existing webhook configurations are removed") cl := cs.KubeClient.AdmissionregistrationV1() // we ignore any errors - _ = cl.MutatingWebhookConfigurations().Delete(context.Background(), operator.DefaultMutatingWebhookName, metav1.DeleteOptions{}) - _ = cl.ValidatingWebhookConfigurations().Delete(context.Background(), operator.DefaultValidatingWebhookName, metav1.DeleteOptions{}) + _, err = cl.MutatingWebhookConfigurations().Get(context.Background(), operator.DefaultMutatingWebhookName, metav1.GetOptions{}) + if err == nil { + // found web hook + setupLog.Info("Deleting existing MutatingWebhookConfigurations", "Names", operator.DefaultMutatingWebhookName) + _ = cl.MutatingWebhookConfigurations().Delete(context.Background(), operator.DefaultMutatingWebhookName, metav1.DeleteOptions{}) + } + _, err = cl.ValidatingWebhookConfigurations().Get(context.Background(), operator.DefaultValidatingWebhookName, metav1.GetOptions{}) + if err == nil { + // found web hook + setupLog.Info("Deleting existing ValidatingWebhookConfigurations", "Names", operator.DefaultValidatingWebhookName) + _ = cl.ValidatingWebhookConfigurations().Delete(context.Background(), operator.DefaultValidatingWebhookName, metav1.DeleteOptions{}) + } + setupLog.Info("Done ensuring any existing webhook configurations are removed") dryRun := operator.IsDryRun() - secureMetrics := vpr.GetBool(operator.FlagSecureMetrics) + secureMetrics := v.GetBool(operator.FlagSecureMetrics) // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. // More info: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.0/pkg/metrics/server // - https://book.kubebuilder.io/reference/metrics.html + setupLog.Info("Configuring operator metrics", "Secure", secureMetrics) metricsServerOptions := metricsserver.Options{ BindAddress: viper.GetString(operator.FlagMetricsAddress), SecureServing: secureMetrics, @@ -202,12 +221,14 @@ func execute(v *viper.Viper) error { } } + setupLog.Info("Creating controller manager") mgr, err := manager.New(cfg, options) if err != nil { return errors.Wrap(err, "unable to create controller manager") } // Set up the Coherence reconciler + setupLog.Info("Setting up Coherence reconciler") if err = (&controllers.CoherenceReconciler{ Client: mgr.GetClient(), ClientSet: cs, @@ -219,6 +240,7 @@ func execute(v *viper.Viper) error { // Set up the CoherenceJob reconciler if operator.ShouldSupportCoherenceJob() { + setupLog.Info("Setting up CoherenceJob reconciler") if err = (&controllers.CoherenceJobReconciler{ Client: mgr.GetClient(), ClientSet: cs, @@ -258,6 +280,8 @@ func execute(v *viper.Viper) error { setupLog.Error(err, "problem running manager") os.Exit(1) } + } else { + setupLog.Info("Operator is running in dry-run mode") } return nil diff --git a/test/e2e/helper/e2e-helpers.go b/test/e2e/helper/e2e-helpers.go index d9ff849ad..736cbb47d 100644 --- a/test/e2e/helper/e2e-helpers.go +++ b/test/e2e/helper/e2e-helpers.go @@ -23,6 +23,7 @@ import ( appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + discv1 "k8s.io/api/discovery/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/runtime" @@ -1725,16 +1726,6 @@ func AssertDeploymentsInNamespace(ctx TestContext, t *testing.T, yamlFile, names // assert that the correct number of Pods is returned g.Expect(len(pods)).To(Equal(expectedClusterSize)) - // Verify that the WKA service has the same number of endpoints as the cluster size. - serviceName := deployments[0].GetWkaServiceName() - - ep, err := ctx.KubeClient.CoreV1().Endpoints(namespace).Get(ctx.Context, serviceName, metav1.GetOptions{}) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(len(ep.Subsets)).NotTo(BeZero()) - - subset := ep.Subsets[0] - g.Expect(len(subset.Addresses)).To(Equal(expectedWkaSize)) - m := make(map[string]coh.Coherence) for _, d := range deployments { opts := client.ObjectKey{Namespace: namespace, Name: d.Name} @@ -1744,37 +1735,33 @@ func AssertDeploymentsInNamespace(ctx TestContext, t *testing.T, yamlFile, names m[dpl.Name] = dpl } - // Obtain the expected WKA list of Pod IP addresses - var wkaPods []string - for _, d := range deployments { - if d.Spec.Coherence.IsWKAMember() { - pods, err := ListCoherencePodsForDeployment(ctx, d.Namespace, d.Name) - g.Expect(err).NotTo(HaveOccurred()) - for _, pod := range pods { - wkaPods = append(wkaPods, pod.Status.PodIP) - } - } - } - // Verify that the WKA service endpoints list for each deployment has all the required the Pod IP addresses. for _, d := range deployments { + // Verify that the WKA service has the same number of endpoints as the cluster size. serviceName := d.GetWkaServiceName() - ep, err = ctx.KubeClient.CoreV1().Endpoints(namespace).Get(ctx.Context, serviceName, metav1.GetOptions{}) + list, err := GetEndpointsForService(ctx, namespace, serviceName) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(len(ep.Subsets)).NotTo(BeZero()) - - subset := ep.Subsets[0] - g.Expect(len(subset.Addresses)).To(Equal(len(wkaPods))) - var actualWKA []string - for _, address := range subset.Addresses { - actualWKA = append(actualWKA, address.IP) - } - g.Expect(actualWKA).To(ConsistOf(wkaPods)) + g.Expect(len(list)).To(Equal(expectedWkaSize)) } return m, pods } +func GetEndpointsForService(ctx TestContext, namespace, name string) ([]discv1.Endpoint, error) { + var all []discv1.Endpoint + + list, err := ctx.KubeClient.DiscoveryV1().EndpointSlices(namespace).List(ctx.Context, metav1.ListOptions{}) + if err != nil { + return nil, err + } + for _, eps := range list.Items { + if svcName, found := eps.Labels["kubernetes.io/service-name"]; found && svcName == name { + all = append(all, eps.Endpoints...) + } + } + return all, err +} + // AssertCoherenceJobs tests that one or more CoherenceJobs can be created using the specified yaml. func AssertCoherenceJobs(ctx TestContext, t *testing.T, yamlFile string) (map[string]coh.CoherenceJob, []corev1.Pod) { return AssertCoherenceJobsInNamespace(ctx, t, yamlFile, GetTestNamespace()) diff --git a/test/e2e/helper/test_context.go b/test/e2e/helper/test_context.go index 0aca7ffa0..7408b0a21 100644 --- a/test/e2e/helper/test_context.go +++ b/test/e2e/helper/test_context.go @@ -9,6 +9,10 @@ package helper import ( "flag" "fmt" + "net/http" + "os" + "testing" + "github.com/go-logr/logr" coh "github.com/oracle/coherence-operator/api/v1" "github.com/oracle/coherence-operator/controllers" @@ -21,18 +25,14 @@ import ( "github.com/spf13/viper" "golang.org/x/net/context" corev1 "k8s.io/api/core/v1" - v1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" - "net/http" - "os" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" - "testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -199,7 +199,6 @@ func NewContext(startController bool, watchNamespaces ...string) (TestContext, e testEnv := &envtest.Environment{ UseExistingCluster: &useCluster, AttachControlPlaneOutput: true, - CRDs: []*v1.CustomResourceDefinition{}, } var err error diff --git a/test/e2e/prometheus/prometheus_test.go b/test/e2e/prometheus/prometheus_test.go index 13b4d5600..9a05242e6 100644 --- a/test/e2e/prometheus/prometheus_test.go +++ b/test/e2e/prometheus/prometheus_test.go @@ -10,21 +10,22 @@ import ( "context" "encoding/json" "fmt" + "io" + "net/http" + "strings" + "testing" + "time" + . "github.com/onsi/gomega" coh "github.com/oracle/coherence-operator/api/v1" "github.com/oracle/coherence-operator/test/e2e/helper" monitoring "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned/typed/monitoring/v1" - "io" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" - "net/http" - "strings" - "testing" - "time" ) func TestPrometheus(t *testing.T) { @@ -85,7 +86,9 @@ func ShouldGetPrometheusConfig(t *testing.T, pod corev1.Pod) { func ShouldEventuallySeeClusterMetrics(t *testing.T, promPod corev1.Pod, cohPods []corev1.Pod) { g := NewGomegaWithT(t) - err := wait.PollUntilContextTimeout(context.Background(), time.Second*20, time.Minute*15, true, func(context.Context) (done bool, err error) { + t.Logf("Waiting for Coherence cluster metrics to appear in Prometheus") + + err := wait.PollUntilContextTimeout(context.Background(), time.Second*20, time.Minute*2, true, func(context.Context) (done bool, err error) { result := PrometheusVector{} err = PrometheusQuery(t, promPod, "up", &result) if err != nil { @@ -164,6 +167,8 @@ func hasInterval(t *testing.T, sm *monitoring.ServiceMonitor) bool { func ShouldEventuallyHaveServiceMonitorWithState(t *testing.T, namespace, name string, predicate ServiceMonitorPredicate, promClient *client.MonitoringV1Client, retryInterval, timeout time.Duration) error { var sm *monitoring.ServiceMonitor + t.Logf("Waiting for ServiceMonitor resource %s/%s to be available", namespace, name) + err := wait.PollUntilContextTimeout(context.Background(), retryInterval, timeout, true, func(context.Context) (done bool, err error) { sm, err = promClient.ServiceMonitors(namespace).Get(testContext.Context, name, v1.GetOptions{}) if err != nil {