From f3842714c7cc97bbf24ef987eba7ba65fef87052 Mon Sep 17 00:00:00 2001 From: joe miller Date: Tue, 9 Dec 2025 16:28:05 +0000 Subject: [PATCH 1/3] feat: add debug logging to diagnose label sync issues [minor] When investigating potential issues with node labels syncing to EC2 was no visibility into the controller's decision-making. The predicates silently filter events, and reconcile doesn't log which labels it finds vs expects. Without this, debugging requires guessing whether the predicate rejected the event or the label simply wasn't on the node. Adds V(1) debug logs throughout the sync pipeline. The predicate now logs every Create/Update event with `shouldProcess=true/false` so you can see if events are being filtered. Reconcile logs which monitored labels are missing from the node and the final set of tags being synced. Replaces the custom `--json` flag with controller-runtime's standard zap flags (`--zap-log-level`, `--zap-encoder`, etc). To see debug logs, run with `--zap-log-level=debug` or `--zap-log-level=1`. Additionally, introduce a new 4 hour resync period and update the predicate on Update events to allow a full resync to the cloud provider to catch any potentially missed label sync events. This is a trade-off between eventual consistency and the cost of the extra DescribeTags calls. > NOTE: Due to the dropping of `--json` flag and replacing with zap flags, > the minor version is being bumped to v0.1.0. --- controller.go | 36 ++++++++- controller_test.go | 184 +++++++++++++++++++++++++++++++++++++++++++++ main.go | 36 +++++---- 3 files changed, 238 insertions(+), 18 deletions(-) diff --git a/controller.go b/controller.go index bdcca9f..1304a8b 100644 --- a/controller.go +++ b/controller.go @@ -11,6 +11,7 @@ import ( awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/ec2" "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/go-logr/logr" gce "google.golang.org/api/compute/v1" corev1 "k8s.io/api/core/v1" ctrl "sigs.k8s.io/controller-runtime" @@ -21,6 +22,7 @@ import ( type NodeLabelController struct { client.Client + Logger logr.Logger EC2Client ec2Client GCEClient gceClient @@ -67,7 +69,22 @@ func (r *NodeLabelController) SetupWithManager(mgr ctrl.Manager) error { if !ok { return false } - return shouldProcessNodeUpdate(oldNode, newNode, r.Labels, r.Annotations) + + // Process if any monitored label/annotation changed + if shouldProcessNodeUpdate(oldNode, newNode, r.Labels, r.Annotations) { + r.Logger.V(1).Info("Update event: label changed", "node", newNode.Name) + return true + } + + // Also process if node has monitored labels (catches resync events). + // During resync, old == new, so shouldProcessNodeUpdate returns false, + // but we still want to reconcile to catch any missed events. + if shouldProcessNodeCreate(newNode, r.Labels, r.Annotations) { + r.Logger.V(1).Info("Update event: resync", "node", newNode.Name) + return true + } + + return false }, CreateFunc: func(e event.CreateEvent) bool { @@ -75,7 +92,14 @@ func (r *NodeLabelController) SetupWithManager(mgr ctrl.Manager) error { if !ok { return false } - return shouldProcessNodeCreate(node, r.Labels, r.Annotations) + shouldProcess := shouldProcessNodeCreate(node, r.Labels, r.Annotations) + r.Logger.V(1).Info("Create event", + "node", node.Name, + "shouldProcess", shouldProcess, + "labels", node.Labels, + "monitoredLabels", r.Labels, + ) + return shouldProcess }, DeleteFunc: func(e event.DeleteEvent) bool { @@ -164,7 +188,7 @@ func shouldProcessNodeCreate(node *corev1.Node, monitoredLabels, monitoredAnnota } func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - logger := ctrl.Log.WithName("reconcile").WithValues("node", req.NamespacedName) + logger := r.Logger.WithValues("node", req.NamespacedName) var node corev1.Node if err := r.Get(ctx, req.NamespacedName, &node); err != nil { @@ -186,6 +210,8 @@ func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) ( for _, k := range r.Labels { if value, exists := node.Labels[k]; exists { tagsToSync[k] = value + } else { + logger.V(1).Info("Monitored label not found on node", "label", k) } } } @@ -195,10 +221,14 @@ func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) ( for _, k := range r.Annotations { if value, exists := node.Annotations[k]; exists { tagsToSync[k] = value + } else { + logger.V(1).Info("Monitored annotation not found on node", "annotation", k) } } } + logger.V(1).Info("Collected tags to sync", "tagsToSync", tagsToSync, "monitoredLabels", r.Labels, "monitoredAnnotations", r.Annotations) + var err error switch r.Cloud { case "aws": diff --git a/controller_test.go b/controller_test.go index 09e1f0f..608de62 100644 --- a/controller_test.go +++ b/controller_test.go @@ -8,6 +8,7 @@ import ( "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/ec2" "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/go-logr/logr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" gce "google.golang.org/api/compute/v1" @@ -184,6 +185,7 @@ func TestReconcileAWS(t *testing.T) { r := &NodeLabelController{ Client: k8s, + Logger: logr.Discard(), Labels: tt.labelsToCopy, Annotations: tt.annotationsToCopy, Cloud: "aws", @@ -315,6 +317,7 @@ func TestReconcileGCP(t *testing.T) { r := &NodeLabelController{ Client: k8s, + Logger: logr.Discard(), Labels: tt.labelsToCopy, Annotations: tt.annotationsToCopy, Cloud: "gcp", @@ -658,3 +661,184 @@ func createNode(config mockNode) *corev1.Node { }, } } + +// TestPredicateToReconcileFlow tests the full flow from event through predicate to reconcile. +// This simulates what controller-runtime does: events are filtered by predicates, and only +// if the predicate allows, reconcile is called. +func TestPredicateToReconcileFlow(t *testing.T) { + tests := []struct { + name string + monitoredLabels []string + initialNode mockNode + updatedNode *mockNode // nil means no update step + expectReconcileOnCreate bool + expectReconcileOnUpdate bool + expectTagsCreated []string // tag keys we expect to be created + }{ + { + name: "node created without monitored labels then labels added", + monitoredLabels: []string{"env", "team"}, + initialNode: mockNode{ + Name: "node1", + Labels: map[string]string{"kubernetes.io/hostname": "node1"}, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + updatedNode: &mockNode{ + Name: "node1", + Labels: map[string]string{"kubernetes.io/hostname": "node1", "env": "prod", "team": "platform"}, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + expectReconcileOnCreate: false, // no monitored labels yet + expectReconcileOnUpdate: true, // monitored labels added + expectTagsCreated: []string{"env", "team"}, + }, + { + name: "node created with monitored labels already present", + monitoredLabels: []string{"env"}, + initialNode: mockNode{ + Name: "node1", + Labels: map[string]string{"env": "prod"}, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + updatedNode: nil, + expectReconcileOnCreate: true, + expectReconcileOnUpdate: false, + expectTagsCreated: []string{"env"}, + }, + { + name: "node created without labels then only some monitored labels added", + monitoredLabels: []string{"env", "team", "region"}, + initialNode: mockNode{ + Name: "node1", + Labels: map[string]string{}, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + updatedNode: &mockNode{ + Name: "node1", + Labels: map[string]string{"env": "prod"}, // only env, not team or region + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + expectReconcileOnCreate: false, + expectReconcileOnUpdate: true, + expectTagsCreated: []string{"env"}, // only env should be synced + }, + { + name: "node update that does not change monitored labels triggers resync", + monitoredLabels: []string{"env"}, + initialNode: mockNode{ + Name: "node1", + Labels: map[string]string{"env": "prod"}, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + updatedNode: &mockNode{ + Name: "node1", + Labels: map[string]string{"env": "prod", "unrelated": "change"}, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + expectReconcileOnCreate: true, + expectReconcileOnUpdate: true, // resync: node has monitored labels + expectTagsCreated: []string{"env"}, + }, + { + name: "multiple labels added in single update", + monitoredLabels: []string{"env", "team", "cost-center"}, + initialNode: mockNode{ + Name: "node1", + Labels: map[string]string{}, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + updatedNode: &mockNode{ + Name: "node1", + Labels: map[string]string{ + "env": "prod", + "team": "platform", + "cost-center": "12345", + }, + ProviderID: "aws:///us-east-1a/i-1234567890abcdef0", + }, + expectReconcileOnCreate: false, + expectReconcileOnUpdate: true, + expectTagsCreated: []string{"env", "team", "cost-center"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + + // Start with initial node in the fake client + k8s := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(createNode(tt.initialNode)). + Build() + + mock := &mockEC2Client{currentTags: []types.TagDescription{}} + + controller := &NodeLabelController{ + Client: k8s, + Logger: logr.Discard(), + Labels: tt.monitoredLabels, + Annotations: []string{}, + Cloud: "aws", + EC2Client: mock, + } + + // Simulate CREATE event + initialNodeObj := createNode(tt.initialNode) + createAllowed := shouldProcessNodeCreate(initialNodeObj, tt.monitoredLabels, []string{}) + + assert.Equal(t, tt.expectReconcileOnCreate, createAllowed, + "Create predicate returned unexpected result") + + if createAllowed { + _, err := controller.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKey{Name: tt.initialNode.Name}, + }) + require.NoError(t, err) + + // Verify tags were created + createdKeys := make([]string, 0, len(mock.createdTags)) + for _, tag := range mock.createdTags { + createdKeys = append(createdKeys, aws.ToString(tag.Key)) + } + assert.ElementsMatch(t, tt.expectTagsCreated, createdKeys, + "Created tags don't match expected") + } + + // Simulate UPDATE event if provided + if tt.updatedNode != nil { + // Reset mock for update test + mock.createdTags = nil + mock.currentTags = []types.TagDescription{} // EC2 has no tags yet (simulating missed create) + + // Update the node in the fake client + updatedNodeObj := createNode(*tt.updatedNode) + err := k8s.Update(context.Background(), updatedNodeObj) + require.NoError(t, err) + + // Match the actual predicate logic: allow if labels changed OR if node has monitored labels (resync) + updateAllowed := shouldProcessNodeUpdate(initialNodeObj, updatedNodeObj, tt.monitoredLabels, []string{}) || + shouldProcessNodeCreate(updatedNodeObj, tt.monitoredLabels, []string{}) + + assert.Equal(t, tt.expectReconcileOnUpdate, updateAllowed, + "Update predicate returned unexpected result") + + if updateAllowed { + _, err := controller.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKey{Name: tt.updatedNode.Name}, + }) + require.NoError(t, err) + + // Verify tags were created + createdKeys := make([]string, 0, len(mock.createdTags)) + for _, tag := range mock.createdTags { + createdKeys = append(createdKeys, aws.ToString(tag.Key)) + } + assert.ElementsMatch(t, tt.expectTagsCreated, createdKeys, + "Created tags on update don't match expected") + } + } + }) + } +} diff --git a/main.go b/main.go index 1425a46..3bd6831 100644 --- a/main.go +++ b/main.go @@ -6,16 +6,22 @@ import ( "fmt" "os" "strings" + "time" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" ) -const leaderElectionId = "node-label-controller" +const ( + leaderElectionID = "node-label-controller" + cacheSyncPeriod = 4 * time.Hour +) func main() { var probesAddr string @@ -25,9 +31,6 @@ func main() { var labelsStr string var annotationsStr string var cloudProvider string - var jsonLogs bool - - logger := ctrl.Log.WithName("main") flag.StringVar(&probesAddr, "probes-addr", ":8080", "The address the /readyz and /healthz probes endpoint binds to.") flag.StringVar(&metricsAddr, "metrics-addr", ":8081", "The address the metric endpoint binds to.") @@ -36,29 +39,28 @@ func main() { flag.StringVar(&labelsStr, "labels", "", "Comma-separated list of label keys to sync") flag.StringVar(&annotationsStr, "annotations", "", "Comma-separated list of annotation keys to sync") flag.StringVar(&cloudProvider, "cloud", "", "Cloud provider (aws or gcp)") - flag.BoolVar(&jsonLogs, "json", false, "Output logs in JSON format") + + // Add zap logger flags (--zap-log-level, --zap-stacktrace-level, --zap-encoder, --zap-devel) + zapOpts := zap.Options{} + zapOpts.BindFlags(flag.CommandLine) + flag.Parse() - // setup logger. Use development mode by default or json output if --json is set - var opts []zap.Opts - opts = append(opts, zap.UseDevMode(!jsonLogs)) - if jsonLogs { - opts = append(opts, zap.JSONEncoder()) - } - ctrl.SetLogger(zap.New(opts...)) + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&zapOpts))) + logger := ctrl.Log.WithName("main") // validate flags if labelsStr == "" && annotationsStr == "" { logger.Error(fmt.Errorf("either --labels or --annotations is required"), "unable to start manager") os.Exit(1) } - + var labels []string if labelsStr != "" { labels = strings.Split(labelsStr, ",") logger.Info("Label keys to sync", "labelKeys", labels) } - + var annotations []string if annotationsStr != "" { annotations = strings.Split(annotationsStr, ",") @@ -89,7 +91,10 @@ func main() { }, PprofBindAddress: pprofAddr, LeaderElection: enableLeaderElection, - LeaderElectionID: leaderElectionId, + LeaderElectionID: leaderElectionID, + Cache: cache.Options{ + SyncPeriod: ptr.To(cacheSyncPeriod), + }, }) if err != nil { logger.Error(err, "unable to start manager") @@ -111,6 +116,7 @@ func main() { // setup our controller and start it controller := &NodeLabelController{ Client: mgr.GetClient(), + Logger: ctrl.Log.WithName("controller"), Labels: labels, Annotations: annotations, Cloud: cloudProvider, From 65e01daa18b31b22cca9edd252c4d1252f64c27b Mon Sep 17 00:00:00 2001 From: joe miller Date: Tue, 9 Dec 2025 17:28:07 +0000 Subject: [PATCH 2/3] also log cloud provider node ID to help with debugging --- controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controller.go b/controller.go index 1304a8b..5b1fa15 100644 --- a/controller.go +++ b/controller.go @@ -238,11 +238,11 @@ func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) ( } if err != nil { - logger.Error(err, "failed to sync tags") + logger.Error(err, "failed to sync tags", "providerID", providerID) return ctrl.Result{}, err } - logger.Info("Successfully synced tags to cloud provider", "tags", tagsToSync) + logger.Info("Successfully synced tags to cloud provider", "providerID", providerID, "tags", tagsToSync) return ctrl.Result{}, nil } From c0662044a134295ea9247f62d1ef6e5b26295877 Mon Sep 17 00:00:00 2001 From: joe miller Date: Tue, 9 Dec 2025 19:40:59 +0000 Subject: [PATCH 3/3] accumulate missing labels and annotations in debug logs instead of logging them individually --- controller.go | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/controller.go b/controller.go index 5b1fa15..5cbe39e 100644 --- a/controller.go +++ b/controller.go @@ -204,30 +204,31 @@ func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) ( // Create a map for tags to sync with the cloud provider tagsToSync := make(map[string]string) + var notFoundLabels, notFoundAnnotations []string // First collect labels (may be overwritten by annotations with same key) - if node.Labels != nil { - for _, k := range r.Labels { - if value, exists := node.Labels[k]; exists { - tagsToSync[k] = value - } else { - logger.V(1).Info("Monitored label not found on node", "label", k) - } + for _, k := range r.Labels { + if value, exists := node.Labels[k]; exists { + tagsToSync[k] = value + } else { + notFoundLabels = append(notFoundLabels, k) } } // Then collect annotations (will overwrite labels with same key) - if node.Annotations != nil { - for _, k := range r.Annotations { - if value, exists := node.Annotations[k]; exists { - tagsToSync[k] = value - } else { - logger.V(1).Info("Monitored annotation not found on node", "annotation", k) - } + for _, k := range r.Annotations { + if value, exists := node.Annotations[k]; exists { + tagsToSync[k] = value + } else { + notFoundAnnotations = append(notFoundAnnotations, k) } } - logger.V(1).Info("Collected tags to sync", "tagsToSync", tagsToSync, "monitoredLabels", r.Labels, "monitoredAnnotations", r.Annotations) + logger.V(1).Info("Collected tags to sync", + "tagsToSync", tagsToSync, + "notFoundLabels", notFoundLabels, + "notFoundAnnotations", notFoundAnnotations, + ) var err error switch r.Cloud {