Skip to content

Commit f384271

Browse files
committed
feat: add debug logging to diagnose label sync issues [minor]
When investigating potential issues with node labels syncing to EC2 was no visibility into the controller's decision-making. The predicates silently filter events, and reconcile doesn't log which labels it finds vs expects. Without this, debugging requires guessing whether the predicate rejected the event or the label simply wasn't on the node. Adds V(1) debug logs throughout the sync pipeline. The predicate now logs every Create/Update event with `shouldProcess=true/false` so you can see if events are being filtered. Reconcile logs which monitored labels are missing from the node and the final set of tags being synced. Replaces the custom `--json` flag with controller-runtime's standard zap flags (`--zap-log-level`, `--zap-encoder`, etc). To see debug logs, run with `--zap-log-level=debug` or `--zap-log-level=1`. Additionally, introduce a new 4 hour resync period and update the predicate on Update events to allow a full resync to the cloud provider to catch any potentially missed label sync events. This is a trade-off between eventual consistency and the cost of the extra DescribeTags calls. > NOTE: Due to the dropping of `--json` flag and replacing with zap flags, > the minor version is being bumped to v0.1.0.
1 parent c5aa602 commit f384271

File tree

3 files changed

+238
-18
lines changed

3 files changed

+238
-18
lines changed

controller.go

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
awsconfig "github.com/aws/aws-sdk-go-v2/config"
1212
"github.com/aws/aws-sdk-go-v2/service/ec2"
1313
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
14+
"github.com/go-logr/logr"
1415
gce "google.golang.org/api/compute/v1"
1516
corev1 "k8s.io/api/core/v1"
1617
ctrl "sigs.k8s.io/controller-runtime"
@@ -21,6 +22,7 @@ import (
2122

2223
type NodeLabelController struct {
2324
client.Client
25+
Logger logr.Logger
2426
EC2Client ec2Client
2527
GCEClient gceClient
2628

@@ -67,15 +69,37 @@ func (r *NodeLabelController) SetupWithManager(mgr ctrl.Manager) error {
6769
if !ok {
6870
return false
6971
}
70-
return shouldProcessNodeUpdate(oldNode, newNode, r.Labels, r.Annotations)
72+
73+
// Process if any monitored label/annotation changed
74+
if shouldProcessNodeUpdate(oldNode, newNode, r.Labels, r.Annotations) {
75+
r.Logger.V(1).Info("Update event: label changed", "node", newNode.Name)
76+
return true
77+
}
78+
79+
// Also process if node has monitored labels (catches resync events).
80+
// During resync, old == new, so shouldProcessNodeUpdate returns false,
81+
// but we still want to reconcile to catch any missed events.
82+
if shouldProcessNodeCreate(newNode, r.Labels, r.Annotations) {
83+
r.Logger.V(1).Info("Update event: resync", "node", newNode.Name)
84+
return true
85+
}
86+
87+
return false
7188
},
7289

7390
CreateFunc: func(e event.CreateEvent) bool {
7491
node, ok := e.Object.(*corev1.Node)
7592
if !ok {
7693
return false
7794
}
78-
return shouldProcessNodeCreate(node, r.Labels, r.Annotations)
95+
shouldProcess := shouldProcessNodeCreate(node, r.Labels, r.Annotations)
96+
r.Logger.V(1).Info("Create event",
97+
"node", node.Name,
98+
"shouldProcess", shouldProcess,
99+
"labels", node.Labels,
100+
"monitoredLabels", r.Labels,
101+
)
102+
return shouldProcess
79103
},
80104

81105
DeleteFunc: func(e event.DeleteEvent) bool {
@@ -164,7 +188,7 @@ func shouldProcessNodeCreate(node *corev1.Node, monitoredLabels, monitoredAnnota
164188
}
165189

166190
func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
167-
logger := ctrl.Log.WithName("reconcile").WithValues("node", req.NamespacedName)
191+
logger := r.Logger.WithValues("node", req.NamespacedName)
168192

169193
var node corev1.Node
170194
if err := r.Get(ctx, req.NamespacedName, &node); err != nil {
@@ -186,6 +210,8 @@ func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) (
186210
for _, k := range r.Labels {
187211
if value, exists := node.Labels[k]; exists {
188212
tagsToSync[k] = value
213+
} else {
214+
logger.V(1).Info("Monitored label not found on node", "label", k)
189215
}
190216
}
191217
}
@@ -195,10 +221,14 @@ func (r *NodeLabelController) Reconcile(ctx context.Context, req ctrl.Request) (
195221
for _, k := range r.Annotations {
196222
if value, exists := node.Annotations[k]; exists {
197223
tagsToSync[k] = value
224+
} else {
225+
logger.V(1).Info("Monitored annotation not found on node", "annotation", k)
198226
}
199227
}
200228
}
201229

230+
logger.V(1).Info("Collected tags to sync", "tagsToSync", tagsToSync, "monitoredLabels", r.Labels, "monitoredAnnotations", r.Annotations)
231+
202232
var err error
203233
switch r.Cloud {
204234
case "aws":

controller_test.go

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/aws/aws-sdk-go-v2/aws"
99
"github.com/aws/aws-sdk-go-v2/service/ec2"
1010
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
11+
"github.com/go-logr/logr"
1112
"github.com/stretchr/testify/assert"
1213
"github.com/stretchr/testify/require"
1314
gce "google.golang.org/api/compute/v1"
@@ -184,6 +185,7 @@ func TestReconcileAWS(t *testing.T) {
184185

185186
r := &NodeLabelController{
186187
Client: k8s,
188+
Logger: logr.Discard(),
187189
Labels: tt.labelsToCopy,
188190
Annotations: tt.annotationsToCopy,
189191
Cloud: "aws",
@@ -315,6 +317,7 @@ func TestReconcileGCP(t *testing.T) {
315317

316318
r := &NodeLabelController{
317319
Client: k8s,
320+
Logger: logr.Discard(),
318321
Labels: tt.labelsToCopy,
319322
Annotations: tt.annotationsToCopy,
320323
Cloud: "gcp",
@@ -658,3 +661,184 @@ func createNode(config mockNode) *corev1.Node {
658661
},
659662
}
660663
}
664+
665+
// TestPredicateToReconcileFlow tests the full flow from event through predicate to reconcile.
666+
// This simulates what controller-runtime does: events are filtered by predicates, and only
667+
// if the predicate allows, reconcile is called.
668+
func TestPredicateToReconcileFlow(t *testing.T) {
669+
tests := []struct {
670+
name string
671+
monitoredLabels []string
672+
initialNode mockNode
673+
updatedNode *mockNode // nil means no update step
674+
expectReconcileOnCreate bool
675+
expectReconcileOnUpdate bool
676+
expectTagsCreated []string // tag keys we expect to be created
677+
}{
678+
{
679+
name: "node created without monitored labels then labels added",
680+
monitoredLabels: []string{"env", "team"},
681+
initialNode: mockNode{
682+
Name: "node1",
683+
Labels: map[string]string{"kubernetes.io/hostname": "node1"},
684+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
685+
},
686+
updatedNode: &mockNode{
687+
Name: "node1",
688+
Labels: map[string]string{"kubernetes.io/hostname": "node1", "env": "prod", "team": "platform"},
689+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
690+
},
691+
expectReconcileOnCreate: false, // no monitored labels yet
692+
expectReconcileOnUpdate: true, // monitored labels added
693+
expectTagsCreated: []string{"env", "team"},
694+
},
695+
{
696+
name: "node created with monitored labels already present",
697+
monitoredLabels: []string{"env"},
698+
initialNode: mockNode{
699+
Name: "node1",
700+
Labels: map[string]string{"env": "prod"},
701+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
702+
},
703+
updatedNode: nil,
704+
expectReconcileOnCreate: true,
705+
expectReconcileOnUpdate: false,
706+
expectTagsCreated: []string{"env"},
707+
},
708+
{
709+
name: "node created without labels then only some monitored labels added",
710+
monitoredLabels: []string{"env", "team", "region"},
711+
initialNode: mockNode{
712+
Name: "node1",
713+
Labels: map[string]string{},
714+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
715+
},
716+
updatedNode: &mockNode{
717+
Name: "node1",
718+
Labels: map[string]string{"env": "prod"}, // only env, not team or region
719+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
720+
},
721+
expectReconcileOnCreate: false,
722+
expectReconcileOnUpdate: true,
723+
expectTagsCreated: []string{"env"}, // only env should be synced
724+
},
725+
{
726+
name: "node update that does not change monitored labels triggers resync",
727+
monitoredLabels: []string{"env"},
728+
initialNode: mockNode{
729+
Name: "node1",
730+
Labels: map[string]string{"env": "prod"},
731+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
732+
},
733+
updatedNode: &mockNode{
734+
Name: "node1",
735+
Labels: map[string]string{"env": "prod", "unrelated": "change"},
736+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
737+
},
738+
expectReconcileOnCreate: true,
739+
expectReconcileOnUpdate: true, // resync: node has monitored labels
740+
expectTagsCreated: []string{"env"},
741+
},
742+
{
743+
name: "multiple labels added in single update",
744+
monitoredLabels: []string{"env", "team", "cost-center"},
745+
initialNode: mockNode{
746+
Name: "node1",
747+
Labels: map[string]string{},
748+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
749+
},
750+
updatedNode: &mockNode{
751+
Name: "node1",
752+
Labels: map[string]string{
753+
"env": "prod",
754+
"team": "platform",
755+
"cost-center": "12345",
756+
},
757+
ProviderID: "aws:///us-east-1a/i-1234567890abcdef0",
758+
},
759+
expectReconcileOnCreate: false,
760+
expectReconcileOnUpdate: true,
761+
expectTagsCreated: []string{"env", "team", "cost-center"},
762+
},
763+
}
764+
765+
for _, tt := range tests {
766+
t.Run(tt.name, func(t *testing.T) {
767+
scheme := runtime.NewScheme()
768+
require.NoError(t, corev1.AddToScheme(scheme))
769+
770+
// Start with initial node in the fake client
771+
k8s := fake.NewClientBuilder().
772+
WithScheme(scheme).
773+
WithObjects(createNode(tt.initialNode)).
774+
Build()
775+
776+
mock := &mockEC2Client{currentTags: []types.TagDescription{}}
777+
778+
controller := &NodeLabelController{
779+
Client: k8s,
780+
Logger: logr.Discard(),
781+
Labels: tt.monitoredLabels,
782+
Annotations: []string{},
783+
Cloud: "aws",
784+
EC2Client: mock,
785+
}
786+
787+
// Simulate CREATE event
788+
initialNodeObj := createNode(tt.initialNode)
789+
createAllowed := shouldProcessNodeCreate(initialNodeObj, tt.monitoredLabels, []string{})
790+
791+
assert.Equal(t, tt.expectReconcileOnCreate, createAllowed,
792+
"Create predicate returned unexpected result")
793+
794+
if createAllowed {
795+
_, err := controller.Reconcile(context.Background(), ctrl.Request{
796+
NamespacedName: client.ObjectKey{Name: tt.initialNode.Name},
797+
})
798+
require.NoError(t, err)
799+
800+
// Verify tags were created
801+
createdKeys := make([]string, 0, len(mock.createdTags))
802+
for _, tag := range mock.createdTags {
803+
createdKeys = append(createdKeys, aws.ToString(tag.Key))
804+
}
805+
assert.ElementsMatch(t, tt.expectTagsCreated, createdKeys,
806+
"Created tags don't match expected")
807+
}
808+
809+
// Simulate UPDATE event if provided
810+
if tt.updatedNode != nil {
811+
// Reset mock for update test
812+
mock.createdTags = nil
813+
mock.currentTags = []types.TagDescription{} // EC2 has no tags yet (simulating missed create)
814+
815+
// Update the node in the fake client
816+
updatedNodeObj := createNode(*tt.updatedNode)
817+
err := k8s.Update(context.Background(), updatedNodeObj)
818+
require.NoError(t, err)
819+
820+
// Match the actual predicate logic: allow if labels changed OR if node has monitored labels (resync)
821+
updateAllowed := shouldProcessNodeUpdate(initialNodeObj, updatedNodeObj, tt.monitoredLabels, []string{}) ||
822+
shouldProcessNodeCreate(updatedNodeObj, tt.monitoredLabels, []string{})
823+
824+
assert.Equal(t, tt.expectReconcileOnUpdate, updateAllowed,
825+
"Update predicate returned unexpected result")
826+
827+
if updateAllowed {
828+
_, err := controller.Reconcile(context.Background(), ctrl.Request{
829+
NamespacedName: client.ObjectKey{Name: tt.updatedNode.Name},
830+
})
831+
require.NoError(t, err)
832+
833+
// Verify tags were created
834+
createdKeys := make([]string, 0, len(mock.createdTags))
835+
for _, tag := range mock.createdTags {
836+
createdKeys = append(createdKeys, aws.ToString(tag.Key))
837+
}
838+
assert.ElementsMatch(t, tt.expectTagsCreated, createdKeys,
839+
"Created tags on update don't match expected")
840+
}
841+
}
842+
})
843+
}
844+
}

main.go

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,22 @@ import (
66
"fmt"
77
"os"
88
"strings"
9+
"time"
910

1011
"k8s.io/apimachinery/pkg/runtime"
1112
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
13+
"k8s.io/utils/ptr"
1214
ctrl "sigs.k8s.io/controller-runtime"
15+
"sigs.k8s.io/controller-runtime/pkg/cache"
1316
"sigs.k8s.io/controller-runtime/pkg/healthz"
1417
"sigs.k8s.io/controller-runtime/pkg/log/zap"
1518
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
1619
)
1720

18-
const leaderElectionId = "node-label-controller"
21+
const (
22+
leaderElectionID = "node-label-controller"
23+
cacheSyncPeriod = 4 * time.Hour
24+
)
1925

2026
func main() {
2127
var probesAddr string
@@ -25,9 +31,6 @@ func main() {
2531
var labelsStr string
2632
var annotationsStr string
2733
var cloudProvider string
28-
var jsonLogs bool
29-
30-
logger := ctrl.Log.WithName("main")
3134

3235
flag.StringVar(&probesAddr, "probes-addr", ":8080", "The address the /readyz and /healthz probes endpoint binds to.")
3336
flag.StringVar(&metricsAddr, "metrics-addr", ":8081", "The address the metric endpoint binds to.")
@@ -36,29 +39,28 @@ func main() {
3639
flag.StringVar(&labelsStr, "labels", "", "Comma-separated list of label keys to sync")
3740
flag.StringVar(&annotationsStr, "annotations", "", "Comma-separated list of annotation keys to sync")
3841
flag.StringVar(&cloudProvider, "cloud", "", "Cloud provider (aws or gcp)")
39-
flag.BoolVar(&jsonLogs, "json", false, "Output logs in JSON format")
42+
43+
// Add zap logger flags (--zap-log-level, --zap-stacktrace-level, --zap-encoder, --zap-devel)
44+
zapOpts := zap.Options{}
45+
zapOpts.BindFlags(flag.CommandLine)
46+
4047
flag.Parse()
4148

42-
// setup logger. Use development mode by default or json output if --json is set
43-
var opts []zap.Opts
44-
opts = append(opts, zap.UseDevMode(!jsonLogs))
45-
if jsonLogs {
46-
opts = append(opts, zap.JSONEncoder())
47-
}
48-
ctrl.SetLogger(zap.New(opts...))
49+
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&zapOpts)))
50+
logger := ctrl.Log.WithName("main")
4951

5052
// validate flags
5153
if labelsStr == "" && annotationsStr == "" {
5254
logger.Error(fmt.Errorf("either --labels or --annotations is required"), "unable to start manager")
5355
os.Exit(1)
5456
}
55-
57+
5658
var labels []string
5759
if labelsStr != "" {
5860
labels = strings.Split(labelsStr, ",")
5961
logger.Info("Label keys to sync", "labelKeys", labels)
6062
}
61-
63+
6264
var annotations []string
6365
if annotationsStr != "" {
6466
annotations = strings.Split(annotationsStr, ",")
@@ -89,7 +91,10 @@ func main() {
8991
},
9092
PprofBindAddress: pprofAddr,
9193
LeaderElection: enableLeaderElection,
92-
LeaderElectionID: leaderElectionId,
94+
LeaderElectionID: leaderElectionID,
95+
Cache: cache.Options{
96+
SyncPeriod: ptr.To(cacheSyncPeriod),
97+
},
9398
})
9499
if err != nil {
95100
logger.Error(err, "unable to start manager")
@@ -111,6 +116,7 @@ func main() {
111116
// setup our controller and start it
112117
controller := &NodeLabelController{
113118
Client: mgr.GetClient(),
119+
Logger: ctrl.Log.WithName("controller"),
114120
Labels: labels,
115121
Annotations: annotations,
116122
Cloud: cloudProvider,

0 commit comments

Comments
 (0)