Skip to content

Commit 0a2495a

Browse files
committed
Fix issue where by metrics _could_ be registered *after* the pod has been deleted
1 parent 800f1b2 commit 0a2495a

File tree

6 files changed

+151
-17
lines changed

6 files changed

+151
-17
lines changed

cmd/app/app.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func NewCommand(ctx context.Context) *cobra.Command {
9898
log.Fatal("Unable to set up ready check:", err)
9999
}
100100

101-
metricsServer := metrics.New(log, ctrmetrics.Registry)
101+
metricsServer := metrics.New(log, ctrmetrics.Registry, mgr.GetCache())
102102

103103
opts.Client.Transport = transport.Chain(
104104
cleanhttp.DefaultTransport(),

pkg/controller/pod_controller_test.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,11 @@ func init() {
3434

3535
func TestNewController(t *testing.T) {
3636
kubeClient := fake.NewFakeClient()
37-
metrics := &metrics.Metrics{}
37+
metrics := metrics.New(
38+
logrus.NewEntry(logrus.StandardLogger()),
39+
prometheus.NewRegistry(),
40+
kubeClient,
41+
)
3842
imageClient := &client.Client{}
3943

4044
controller := NewPodReconciler(5*time.Minute, metrics, imageClient, kubeClient, testLogger, true)
@@ -78,6 +82,7 @@ func TestReconcile(t *testing.T) {
7882
metrics := metrics.New(
7983
logrus.NewEntry(logrus.StandardLogger()),
8084
prometheus.NewRegistry(),
85+
kubeClient,
8186
)
8287
controller := NewPodReconciler(5*time.Minute, metrics, imageClient, kubeClient, testLogger, true)
8388
controller.RequeueDuration = 5 * time.Minute
@@ -110,7 +115,11 @@ func TestReconcile(t *testing.T) {
110115
}
111116
func TestSetupWithManager(t *testing.T) {
112117
kubeClient := fake.NewClientBuilder().Build()
113-
metrics := &metrics.Metrics{}
118+
metrics := metrics.New(
119+
logrus.NewEntry(logrus.StandardLogger()),
120+
prometheus.NewRegistry(),
121+
kubeClient,
122+
)
114123
imageClient := &client.Client{}
115124
controller := NewPodReconciler(5*time.Minute, metrics, imageClient, kubeClient, testLogger, true)
116125

pkg/controller/pod_sync_test.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/assert"
1010
corev1 "k8s.io/api/core/v1"
1111
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
12+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
1213

1314
"github.com/prometheus/client_golang/prometheus"
1415

@@ -24,8 +25,13 @@ import (
2425
// Test for the sync method.
2526
func TestController_Sync(t *testing.T) {
2627
t.Parallel()
28+
2729
log := logrus.NewEntry(logrus.New())
28-
metrics := metrics.New(log, prometheus.NewRegistry())
30+
metrics := metrics.New(
31+
logrus.NewEntry(logrus.StandardLogger()),
32+
prometheus.NewRegistry(),
33+
fake.NewFakeClient(),
34+
)
2935
imageClient := &client.Client{}
3036
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
3137
checker := checker.New(searcher)
@@ -60,7 +66,7 @@ func TestController_Sync(t *testing.T) {
6066
func TestController_SyncContainer(t *testing.T) {
6167
t.Parallel()
6268
log := logrus.NewEntry(logrus.New())
63-
metrics := metrics.New(log, prometheus.NewRegistry())
69+
metrics := metrics.New(log, prometheus.NewRegistry(), fake.NewFakeClient())
6470
imageClient := &client.Client{}
6571
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
6672
checker := checker.New(searcher)
@@ -92,7 +98,7 @@ func TestController_SyncContainer(t *testing.T) {
9298
func TestController_CheckContainer(t *testing.T) {
9399
t.Parallel()
94100
log := logrus.NewEntry(logrus.New())
95-
metrics := metrics.New(log, prometheus.NewRegistry())
101+
metrics := metrics.New(log, prometheus.NewRegistry(), fake.NewFakeClient())
96102
imageClient := &client.Client{}
97103
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
98104
checker := checker.New(searcher)
@@ -122,7 +128,7 @@ func TestController_SyncContainer_NoVersionFound(t *testing.T) {
122128
t.Parallel()
123129

124130
log := logrus.NewEntry(logrus.New())
125-
metrics := metrics.New(log, prometheus.NewRegistry())
131+
metrics := metrics.New(log, prometheus.NewRegistry(), fake.NewFakeClient())
126132
imageClient := &client.Client{}
127133
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
128134
checker := checker.New(searcher)

pkg/metrics/metrics.go

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
package metrics
22

33
import (
4+
"context"
45
"sync"
56
"time"
67

78
"github.com/sirupsen/logrus"
89

10+
corev1 "k8s.io/api/core/v1"
11+
"k8s.io/apimachinery/pkg/types"
12+
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
13+
914
"github.com/prometheus/client_golang/prometheus"
1015
"github.com/prometheus/client_golang/prometheus/collectors"
1116
"github.com/prometheus/client_golang/prometheus/promauto"
@@ -23,13 +28,16 @@ type Metrics struct {
2328
containerImageDuration *prometheus.GaugeVec
2429
containerImageErrors *prometheus.CounterVec
2530

31+
cache k8sclient.Reader
32+
2633
// Contains all metrics for the roundtripper
2734
roundTripper *RoundTripper
2835

2936
mu sync.Mutex
3037
}
3138

32-
func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer) *Metrics {
39+
// func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer, kubeClient k8sclient.Client) *Metrics {
40+
func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer, cache k8sclient.Reader) *Metrics {
3341
// Attempt to register, but ignore errors
3442
_ = reg.Register(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
3543
_ = reg.Register(collectors.NewGoCollector())
@@ -74,7 +82,9 @@ func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer) *Metrics {
7482
)
7583

7684
return &Metrics{
77-
log: log.WithField("module", "metrics"),
85+
log: log.WithField("module", "metrics"),
86+
cache: cache,
87+
7888
registry: reg,
7989
containerImageVersion: containerImageVersion,
8090
containerImageDuration: containerImageDuration,
@@ -149,6 +159,11 @@ func (m *Metrics) RegisterImageDuration(namespace, pod, container, image string,
149159
m.mu.Lock()
150160
defer m.mu.Unlock()
151161

162+
if !m.PodExists(context.Background(), namespace, pod) {
163+
m.log.WithField("metric", "RegisterImageDuration").Warnf("pod %s/%s not found, not registering error", namespace, pod)
164+
return
165+
}
166+
152167
m.containerImageDuration.WithLabelValues(
153168
namespace, pod, container, image,
154169
).Set(time.Since(startTime).Seconds())
@@ -158,6 +173,11 @@ func (m *Metrics) ReportError(namespace, pod, container, imageURL string) {
158173
m.mu.Lock()
159174
defer m.mu.Unlock()
160175

176+
if !m.PodExists(context.Background(), namespace, pod) {
177+
m.log.WithField("metric", "ReportError").Warnf("pod %s/%s not found, not registering error", namespace, pod)
178+
return
179+
}
180+
161181
m.containerImageErrors.WithLabelValues(
162182
namespace, pod, container, imageURL,
163183
).Inc()
@@ -191,3 +211,10 @@ func (m *Metrics) buildPartialLabels(namespace, pod string) prometheus.Labels {
191211
"pod": pod,
192212
}
193213
}
214+
215+
// This _should_ leverage the Controllers Cache
216+
func (m *Metrics) PodExists(ctx context.Context, ns, name string) bool {
217+
pod := &corev1.Pod{}
218+
err := m.cache.Get(ctx, types.NamespacedName{Name: name, Namespace: ns}, pod)
219+
return err == nil && pod.GetDeletionTimestamp() == nil
220+
}

pkg/metrics/metrics_test.go

Lines changed: 99 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package metrics
22

33
import (
4+
"context"
45
"fmt"
56
"testing"
67
"time"
@@ -11,10 +12,26 @@ import (
1112

1213
"github.com/stretchr/testify/assert"
1314
"github.com/stretchr/testify/require"
15+
16+
"github.com/sirupsen/logrus"
17+
"github.com/stretchr/testify/assert"
18+
"github.com/stretchr/testify/require"
19+
20+
"k8s.io/apimachinery/pkg/runtime"
21+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
22+
23+
"github.com/prometheus/client_golang/prometheus"
24+
"github.com/prometheus/client_golang/prometheus/testutil"
25+
26+
corev1 "k8s.io/api/core/v1"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
types "k8s.io/apimachinery/pkg/types"
1429
)
1530

31+
var fakek8s = fake.NewFakeClient()
32+
1633
func TestCache(t *testing.T) {
17-
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry())
34+
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry(), fakek8s)
1835

1936
// Lets add some Images/Metrics...
2037
for i, typ := range []string{"init", "container"} {
@@ -25,8 +42,9 @@ func TestCache(t *testing.T) {
2542
// Check and ensure that the metrics are available...
2643
for i, typ := range []string{"init", "container"} {
2744
version := fmt.Sprintf("0.1.%d", i)
28-
mt, err := m.containerImageVersion.GetMetricWith(m.buildFullLabels("namespace", "pod", "container", typ, "url", version, version))
29-
require.NoError(t, err)
45+
mt, _ := m.containerImageVersion.GetMetricWith(
46+
m.buildFullLabels("namespace", "pod", "container", typ, "url", version, version),
47+
)
3048
count := testutil.ToFloat64(mt)
3149
assert.Equal(t, count, float64(1), "Expected to get a metric for containerImageVersion")
3250
}
@@ -46,10 +64,11 @@ func TestCache(t *testing.T) {
4664
// Ensure metrics and values return 0
4765
for i, typ := range []string{"init", "container"} {
4866
version := fmt.Sprintf("0.1.%d", i)
49-
mt, err := m.containerImageVersion.GetMetricWith(m.buildFullLabels("namespace", "pod", "container", typ, "url", version, version))
50-
require.NoError(t, err)
67+
mt, _ := m.containerImageVersion.GetMetricWith(
68+
m.buildFullLabels("namespace", "pod", "container", typ, "url", version, version),
69+
)
5170
count := testutil.ToFloat64(mt)
52-
assert.Equal(t, count, float64(0), "Expected to get a metric for containerImageVersion")
71+
assert.Equal(t, count, float64(0), "Expected NOT to get a metric for containerImageVersion")
5372
}
5473
// And the Last Updated is removed too
5574
for _, typ := range []string{"init", "container"} {
@@ -62,7 +81,7 @@ func TestCache(t *testing.T) {
6281

6382
// TestErrorsReporting verifies that the error metric increments correctly
6483
func TestErrorsReporting(t *testing.T) {
65-
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry())
84+
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry(), fakek8s)
6685

6786
// Reset the metrics before testing
6887
m.containerImageErrors.Reset()
@@ -81,6 +100,16 @@ func TestErrorsReporting(t *testing.T) {
81100

82101
for i, tc := range testCases {
83102
t.Run(fmt.Sprintf("Case %d", i+1), func(t *testing.T) {
103+
err := fakek8s.DeleteAllOf(context.Background(), &corev1.Pod{})
104+
require.NoError(t, err)
105+
106+
// We need to ensure that the pod Exists!
107+
err = fakek8s.Create(context.Background(), &corev1.Pod{
108+
ObjectMeta: metav1.ObjectMeta{Name: tc.pod, Namespace: tc.namespace},
109+
Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: tc.container, Image: tc.image}}},
110+
})
111+
require.NoError(t, err)
112+
84113
// Report an error
85114
m.ReportError(tc.namespace, tc.pod, tc.container, tc.image)
86115

@@ -99,3 +128,66 @@ func TestErrorsReporting(t *testing.T) {
99128
})
100129
}
101130
}
131+
132+
func Test_Metrics_SkipOnDeletedPod(t *testing.T) {
133+
scheme := runtime.NewScheme()
134+
_ = corev1.AddToScheme(scheme)
135+
136+
// Step 1: Create fake client with Pod
137+
pod := &corev1.Pod{
138+
ObjectMeta: metav1.ObjectMeta{
139+
Name: "mypod",
140+
Namespace: "default",
141+
UID: types.UID("test-uid"),
142+
},
143+
}
144+
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(pod).Build()
145+
146+
// Step 2: Create Metrics with fake registry
147+
reg := prometheus.NewRegistry()
148+
log := logrus.NewEntry(logrus.New())
149+
metrics := New(log, reg, client)
150+
151+
// verify Pod exists!
152+
require.True(t,
153+
metrics.PodExists(context.Background(), "default", "mypod"),
154+
"Pod should exist at this point!",
155+
)
156+
157+
// Register some metrics....
158+
metrics.RegisterImageDuration("default", "mypod", "mycontainer", "nginx:latest", time.Now())
159+
160+
// Step 3: Simulate a Delete occuring, Whilst still Reconciling...
161+
_ = client.Delete(context.Background(), pod)
162+
metrics.RemovePod("default", "mypod")
163+
164+
// Step 4: Validate that all metrics have been removed...
165+
metricFamilies, err := reg.Gather()
166+
assert.NoError(t, err)
167+
for _, mf := range metricFamilies {
168+
assert.NotContains(t, *mf.Name, "is_latest_version", "Should not have been found: %+v", mf)
169+
assert.NotContains(t, *mf.Name, "image_lookup_duration", "Should not have been found: %+v", mf)
170+
assert.NotContains(t, *mf.Name, "image_failures_total", "Should not have been found: %+v", mf)
171+
}
172+
173+
// Register Error _after_ sync has completed!
174+
metrics.ReportError("default", "mypod", "mycontianer", "nginx:latest")
175+
176+
// Step 5: Attempt to register metrics (should not register anything)
177+
require.False(t,
178+
metrics.PodExists(context.Background(), "default", "mypod"),
179+
"Pod should NOT exist at this point!",
180+
)
181+
182+
metrics.RegisterImageDuration("default", "mypod", "mycontainer", "nginx:latest", time.Now())
183+
metrics.ReportError("default", "mypod", "mycontianer", "nginx:latest")
184+
185+
// Step 6: Gather metrics and assert none were registered
186+
metricFamilies, err = reg.Gather()
187+
assert.NoError(t, err)
188+
for _, mf := range metricFamilies {
189+
assert.NotContains(t, *mf.Name, "is_latest_version", "Should not have been found: %+v", mf)
190+
assert.NotContains(t, *mf.Name, "image_lookup_duration", "Should not have been found: %+v", mf)
191+
assert.NotContains(t, *mf.Name, "image_failures_total", "Should not have been found: %+v", mf)
192+
}
193+
}

pkg/metrics/roundtripper_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ func TestRoundTripper(t *testing.T) {
164164

165165
for _, tt := range tests {
166166
t.Run(tt.name, func(t *testing.T) {
167-
metricsServer := New(log, prometheus.NewRegistry())
167+
metricsServer := New(log, prometheus.NewRegistry(), fakek8s)
168168
server := httptest.NewServer(tt.handler)
169169
defer server.Close()
170170

0 commit comments

Comments
 (0)