diff --git a/.chloggen/check-crd-availability.yaml b/.chloggen/check-crd-availability.yaml new file mode 100644 index 0000000000..b5ab294818 --- /dev/null +++ b/.chloggen/check-crd-availability.yaml @@ -0,0 +1,16 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action) +component: target allocator + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: check CRD availability before registering informers + +# One or more tracking issues related to the change +issues: [3987] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: diff --git a/apis/v1alpha1/targetallocator_webhook_test.go b/apis/v1alpha1/targetallocator_webhook_test.go index ffc33a1897..c7079b237c 100644 --- a/apis/v1alpha1/targetallocator_webhook_test.go +++ b/apis/v1alpha1/targetallocator_webhook_test.go @@ -237,6 +237,10 @@ func TestTargetAllocatorValidatingWebhook(t *testing.T) { "missing the following rules for system:serviceaccount:test-ns:test-ta-targetallocator - configmaps: [get]", "missing the following rules for system:serviceaccount:test-ns:test-ta-targetallocator - discovery.k8s.io/endpointslices: [get,list,watch]", "missing the following rules for system:serviceaccount:test-ns:test-ta-targetallocator - nonResourceURL: /metrics: [get]", + "missing the following rules for system:serviceaccount:test-ns:test-ta-targetallocator - nonResourceURL: /api: [get]", + "missing the following rules for system:serviceaccount:test-ns:test-ta-targetallocator - nonResourceURL: /api/*: [get]", + "missing the following rules for system:serviceaccount:test-ns:test-ta-targetallocator - nonResourceURL: /apis: [get]", + "missing the following rules for system:serviceaccount:test-ns:test-ta-targetallocator - nonResourceURL: /apis/*: [get]", }, }, { diff --git a/apis/v1beta1/collector_webhook_test.go b/apis/v1beta1/collector_webhook_test.go index e342146051..42598bf172 100644 --- a/apis/v1beta1/collector_webhook_test.go +++ b/apis/v1beta1/collector_webhook_test.go @@ -726,6 +726,10 @@ func TestOTELColValidatingWebhook(t *testing.T) { "missing the following rules for system:serviceaccount:test-ns:adm-warning-targetallocator - configmaps: [get]", "missing the following rules for system:serviceaccount:test-ns:adm-warning-targetallocator - discovery.k8s.io/endpointslices: [get,list,watch]", "missing the following rules for system:serviceaccount:test-ns:adm-warning-targetallocator - nonResourceURL: /metrics: [get]", + "missing the following rules for system:serviceaccount:test-ns:adm-warning-targetallocator - nonResourceURL: /api: [get]", + "missing the following rules for system:serviceaccount:test-ns:adm-warning-targetallocator - nonResourceURL: /api/*: [get]", + "missing the following rules for system:serviceaccount:test-ns:adm-warning-targetallocator - nonResourceURL: /apis: [get]", + "missing the following rules for system:serviceaccount:test-ns:adm-warning-targetallocator - nonResourceURL: /apis/*: [get]", }, }, { diff --git a/apis/v1beta1/targetallocator_rbac.go b/apis/v1beta1/targetallocator_rbac.go index e19851642b..09a0e78213 100644 --- a/apis/v1beta1/targetallocator_rbac.go +++ b/apis/v1beta1/targetallocator_rbac.go @@ -39,6 +39,9 @@ var ( }, { NonResourceURLs: []string{"/metrics"}, Verbs: []string{"get"}, + }, { + NonResourceURLs: []string{"/api", "/api/*", "/apis", "/apis/*"}, + Verbs: []string{"get"}, }, } ) diff --git a/cmd/otel-allocator/internal/watcher/promOperator.go b/cmd/otel-allocator/internal/watcher/promOperator.go index abf9e2839f..90058b4e9e 100644 --- a/cmd/otel-allocator/internal/watcher/promOperator.go +++ b/cmd/otel-allocator/internal/watcher/promOperator.go @@ -27,7 +27,10 @@ import ( "gopkg.in/yaml.v2" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/discovery" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/retry" @@ -54,7 +57,7 @@ func NewPrometheusCRWatcher( factory := informers.NewMonitoringInformerFactories(allowList, denyList, monitoringclient, allocatorconfig.DefaultResyncTime, nil) - monitoringInformers, err := getInformers(factory) + monitoringInformers, err := getInformers(factory, cfg.ClusterConfig, promLogger) if err != nil { return nil, err } @@ -185,34 +188,127 @@ func getNamespaceInformer(ctx context.Context, allowList, denyList map[string]st } +// checkCRDAvailability checks if a specific CRD is available in the cluster. +func checkCRDAvailability(dcl discovery.DiscoveryInterface, resourceName string) (bool, error) { + apiList, err := dcl.ServerGroups() + if err != nil { + return false, err + } + + apiGroups := apiList.Groups + for _, group := range apiGroups { + if group.Name == "monitoring.coreos.com" { + for _, version := range group.Versions { + resources, err := dcl.ServerResourcesForGroupVersion(version.GroupVersion) + if err != nil { + return false, err + } + + for _, resource := range resources.APIResources { + if resource.Name == resourceName { + return true, nil + } + } + } + } + } + + return false, nil +} + +// createInformerIfAvailable creates an informer for the given resource if the CRD is available, +// otherwise returns nil. If CRD availability cannot be checked due to permissions or other errors, +// it fails open and attempts to create the informer anyway. +func createInformerIfAvailable( + factory informers.FactoriesForNamespaces, + dcl discovery.DiscoveryInterface, + resourceName string, + groupVersion schema.GroupVersionResource, + logger *slog.Logger, +) (*informers.ForResource, error) { + available, err := checkCRDAvailability(dcl, resourceName) + if err != nil { + logger.Warn("Failed to check CRD availability, assuming CRD is available", "resource", resourceName, "error", err) + // Fail open: if we can't check availability, assume the CRD is available and try to create the informer + available = true + } + + if !available { + logger.Warn("CRD not available, skipping informer", "resource", resourceName) + return nil, nil + } + + informer, err := informers.NewInformersForResource(factory, groupVersion) + if err != nil { + return nil, fmt.Errorf("failed to create informer for %s: %w", resourceName, err) + } + + return informer, nil +} + // getInformers returns a map of informers for the given resources. -func getInformers(factory informers.FactoriesForNamespaces) (map[string]*informers.ForResource, error) { - serviceMonitorInformers, err := informers.NewInformersForResource(factory, monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.ServiceMonitorName)) +func getInformers(factory informers.FactoriesForNamespaces, clusterConfig *rest.Config, logger *slog.Logger) (map[string]*informers.ForResource, error) { + informersMap := make(map[string]*informers.ForResource) + + // Get the discovery client + dcl, err := discovery.NewDiscoveryClientForConfig(clusterConfig) + if err != nil { + return nil, fmt.Errorf("failed to create discovery client: %w", err) + } + + // ServiceMonitor + serviceMonitorInformer, err := createInformerIfAvailable( + factory, dcl, "servicemonitors", + monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.ServiceMonitorName), + logger, + ) if err != nil { return nil, err } + if serviceMonitorInformer != nil { + informersMap[monitoringv1.ServiceMonitorName] = serviceMonitorInformer + } - podMonitorInformers, err := informers.NewInformersForResource(factory, monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.PodMonitorName)) + // PodMonitor + podMonitorInformer, err := createInformerIfAvailable( + factory, dcl, "podmonitors", + monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.PodMonitorName), + logger, + ) if err != nil { return nil, err } + if podMonitorInformer != nil { + informersMap[monitoringv1.PodMonitorName] = podMonitorInformer + } - probeInformers, err := informers.NewInformersForResource(factory, monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.ProbeName)) + // Probe + probeInformer, err := createInformerIfAvailable( + factory, dcl, "probes", + monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.ProbeName), + logger, + ) if err != nil { return nil, err } + if probeInformer != nil { + informersMap[monitoringv1.ProbeName] = probeInformer + } - scrapeConfigInformers, err := informers.NewInformersForResource(factory, promv1alpha1.SchemeGroupVersion.WithResource(promv1alpha1.ScrapeConfigName)) + // ScrapeConfig + scrapeConfigInformer, err := createInformerIfAvailable( + factory, dcl, "scrapeconfigs", + promv1alpha1.SchemeGroupVersion.WithResource(promv1alpha1.ScrapeConfigName), + logger, + ) if err != nil { return nil, err } + if scrapeConfigInformer != nil { + informersMap[promv1alpha1.ScrapeConfigName] = scrapeConfigInformer + } - return map[string]*informers.ForResource{ - monitoringv1.ServiceMonitorName: serviceMonitorInformers, - monitoringv1.PodMonitorName: podMonitorInformers, - monitoringv1.ProbeName: probeInformers, - promv1alpha1.ScrapeConfigName: scrapeConfigInformers, - }, nil + return informersMap, nil } // Watch wrapped informers and wait for an initial sync. @@ -264,7 +360,13 @@ func (w *PrometheusCRWatcher) Watch(upstreamEvents chan Event, upstreamErrors ch w.logger.Info("Unable to watch namespaces since namespace informer is nil") } + // Only attempt to sync informers that were actually created for name, resource := range w.informers { + if resource == nil { + w.logger.Info("Skipping nil informer", "informer", name) + continue + } + resource.Start(w.stopChannel) if ok := w.WaitForNamedCacheSync(name, resource.HasSynced); !ok { @@ -348,24 +450,46 @@ func (w *PrometheusCRWatcher) LoadConfig(ctx context.Context) (*promconfig.Confi promCfg := &promconfig.Config{} if w.resourceSelector != nil { - serviceMonitorInstances, err := w.resourceSelector.SelectServiceMonitors(ctx, w.informers[monitoringv1.ServiceMonitorName].ListAllByNamespace) - if err != nil { - return nil, err + // Initialize empty maps for all resource types + serviceMonitorInstances := make(map[string]*monitoringv1.ServiceMonitor) + podMonitorInstances := make(map[string]*monitoringv1.PodMonitor) + probeInstances := make(map[string]*monitoringv1.Probe) + scrapeConfigInstances := make(map[string]*promv1alpha1.ScrapeConfig) + + // Get ServiceMonitors if the informer exists + if informer, ok := w.informers[monitoringv1.ServiceMonitorName]; ok { + instances, err := w.resourceSelector.SelectServiceMonitors(ctx, informer.ListAllByNamespace) + if err != nil { + return nil, err + } + serviceMonitorInstances = instances } - podMonitorInstances, err := w.resourceSelector.SelectPodMonitors(ctx, w.informers[monitoringv1.PodMonitorName].ListAllByNamespace) - if err != nil { - return nil, err + // Get PodMonitors if the informer exists + if informer, ok := w.informers[monitoringv1.PodMonitorName]; ok { + instances, err := w.resourceSelector.SelectPodMonitors(ctx, informer.ListAllByNamespace) + if err != nil { + return nil, err + } + podMonitorInstances = instances } - probeInstances, err := w.resourceSelector.SelectProbes(ctx, w.informers[monitoringv1.ProbeName].ListAllByNamespace) - if err != nil { - return nil, err + // Get Probes if the informer exists + if informer, ok := w.informers[monitoringv1.ProbeName]; ok { + instances, err := w.resourceSelector.SelectProbes(ctx, informer.ListAllByNamespace) + if err != nil { + return nil, err + } + probeInstances = instances } - scrapeConfigInstances, err := w.resourceSelector.SelectScrapeConfigs(ctx, w.informers[promv1alpha1.ScrapeConfigName].ListAllByNamespace) - if err != nil { - return nil, err + // Get ScrapeConfigs if the informer exists + if informer, ok := w.informers[promv1alpha1.ScrapeConfigName]; ok { + instances, err := w.resourceSelector.SelectScrapeConfigs(ctx, informer.ListAllByNamespace) + if err != nil { + return nil, err + } + scrapeConfigInstances = instances } generatedConfig, err := w.configGenerator.GenerateServerConfiguration( diff --git a/cmd/otel-allocator/internal/watcher/promOperator_test.go b/cmd/otel-allocator/internal/watcher/promOperator_test.go index 7ce879cb11..bcd3706561 100644 --- a/cmd/otel-allocator/internal/watcher/promOperator_test.go +++ b/cmd/otel-allocator/internal/watcher/promOperator_test.go @@ -28,6 +28,7 @@ import ( "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + fakediscovery "k8s.io/client-go/discovery/fake" "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/tools/cache" fcache "k8s.io/client-go/tools/cache/testing" @@ -1220,7 +1221,8 @@ func getTestPrometheusCRWatcher( } factory := informers.NewMonitoringInformerFactories(map[string]struct{}{v1.NamespaceAll: {}}, map[string]struct{}{}, mClient, 0, nil) - informers, err := getInformers(factory) + + informers, err := getTestInformers(factory) if err != nil { t.Fatal(t, err) } @@ -1303,3 +1305,119 @@ func sanitizeScrapeConfigsForTest(scs []*promconfig.ScrapeConfig) { sc.MetricRelabelConfigs = nil } } + +// getTestInformers creates informers for testing without CRD availability checks. +func getTestInformers(factory informers.FactoriesForNamespaces) (map[string]*informers.ForResource, error) { + informersMap := make(map[string]*informers.ForResource) + + // Create ServiceMonitor informers + serviceMonitorInformers, err := informers.NewInformersForResource(factory, monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.ServiceMonitorName)) + if err != nil { + return nil, err + } + informersMap[monitoringv1.ServiceMonitorName] = serviceMonitorInformers + + // Create PodMonitor informers + podMonitorInformers, err := informers.NewInformersForResource(factory, monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.PodMonitorName)) + if err != nil { + return nil, err + } + informersMap[monitoringv1.PodMonitorName] = podMonitorInformers + + // Create Probe informers + probeInformers, err := informers.NewInformersForResource(factory, monitoringv1.SchemeGroupVersion.WithResource(monitoringv1.ProbeName)) + if err != nil { + return nil, err + } + informersMap[monitoringv1.ProbeName] = probeInformers + + // Create ScrapeConfig informers + scrapeConfigInformers, err := informers.NewInformersForResource(factory, promv1alpha1.SchemeGroupVersion.WithResource(promv1alpha1.ScrapeConfigName)) + if err != nil { + return nil, err + } + informersMap[promv1alpha1.ScrapeConfigName] = scrapeConfigInformers + + return informersMap, nil +} + +// TestCRDAvailabilityChecks tests the CRDs' availability. +func TestCRDAvailabilityChecks(t *testing.T) { + tests := []struct { + name string + availableCRDs []string + expectedCRDs []string + }{ + { + name: "ServiceMonitor available", + availableCRDs: []string{"servicemonitors"}, + expectedCRDs: []string{"servicemonitors"}, + }, + { + name: "All CRDs available", + availableCRDs: []string{"servicemonitors", "podmonitors", "probes", "scrapeconfigs"}, + expectedCRDs: []string{"servicemonitors", "podmonitors", "probes", "scrapeconfigs"}, + }, + { + name: "No CRDs available", + availableCRDs: []string{}, + expectedCRDs: []string{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create fake discovery client + fakeDiscovery := &fakediscovery.FakeDiscovery{ + Fake: &fake.NewSimpleClientset().Fake, + } + + // Set up resources + fakeDiscovery.Resources = []*metav1.APIResourceList{} + + // Add v1 resources + v1Resources := &metav1.APIResourceList{ + GroupVersion: "monitoring.coreos.com/v1", + APIResources: []metav1.APIResource{}, + } + for _, crd := range tt.availableCRDs { + if crd == "servicemonitors" || crd == "podmonitors" || crd == "probes" { + v1Resources.APIResources = append(v1Resources.APIResources, metav1.APIResource{ + Name: crd, + }) + } + } + fakeDiscovery.Resources = append(fakeDiscovery.Resources, v1Resources) + + // Add v1alpha1 resources + v1alpha1Resources := &metav1.APIResourceList{ + GroupVersion: "monitoring.coreos.com/v1alpha1", + APIResources: []metav1.APIResource{}, + } + for _, crd := range tt.availableCRDs { + if crd == "scrapeconfigs" { + v1alpha1Resources.APIResources = append(v1alpha1Resources.APIResources, metav1.APIResource{ + Name: crd, + }) + } + } + fakeDiscovery.Resources = append(fakeDiscovery.Resources, v1alpha1Resources) + + // Test each CRD availability + for _, crd := range []string{"servicemonitors", "podmonitors", "probes", "scrapeconfigs"} { + available, err := checkCRDAvailability(fakeDiscovery, crd) + require.NoError(t, err) + + expected := false + for _, expectedCRD := range tt.expectedCRDs { + if crd == expectedCRD { + expected = true + break + } + } + + assert.Equal(t, expected, available, "CRD %s availability should match expectation", crd) + } + }) + } +}