Skip to content

Commit 4639a65

Browse files
author
David Collom
committed
Fix issue where by metrics _could_ be registered *after* the pod has been deleted
1 parent 83c8fd2 commit 4639a65

File tree

6 files changed

+149
-17
lines changed

6 files changed

+149
-17
lines changed

cmd/app/app.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func NewCommand(ctx context.Context) *cobra.Command {
9898
log.Fatal("Unable to set up ready check:", err)
9999
}
100100

101-
metricsServer := metrics.New(log, ctrmetrics.Registry)
101+
metricsServer := metrics.New(log, ctrmetrics.Registry, mgr.GetCache())
102102

103103
opts.Client.Transport = transport.Chain(
104104
cleanhttp.DefaultTransport(),

pkg/controller/pod_controller_test.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,11 @@ func init() {
3434

3535
func TestNewController(t *testing.T) {
3636
kubeClient := fake.NewFakeClient()
37-
metrics := &metrics.Metrics{}
37+
metrics := metrics.New(
38+
logrus.NewEntry(logrus.StandardLogger()),
39+
prometheus.NewRegistry(),
40+
kubeClient,
41+
)
3842
imageClient := &client.Client{}
3943

4044
controller := NewPodReconciler(5*time.Minute, metrics, imageClient, kubeClient, testLogger, true)
@@ -78,6 +82,7 @@ func TestReconcile(t *testing.T) {
7882
metrics := metrics.New(
7983
logrus.NewEntry(logrus.StandardLogger()),
8084
prometheus.NewRegistry(),
85+
kubeClient,
8186
)
8287
controller := NewPodReconciler(5*time.Minute, metrics, imageClient, kubeClient, testLogger, true)
8388
controller.RequeueDuration = 5 * time.Minute
@@ -110,7 +115,11 @@ func TestReconcile(t *testing.T) {
110115
}
111116
func TestSetupWithManager(t *testing.T) {
112117
kubeClient := fake.NewClientBuilder().Build()
113-
metrics := &metrics.Metrics{}
118+
metrics := metrics.New(
119+
logrus.NewEntry(logrus.StandardLogger()),
120+
prometheus.NewRegistry(),
121+
kubeClient,
122+
)
114123
imageClient := &client.Client{}
115124
controller := NewPodReconciler(5*time.Minute, metrics, imageClient, kubeClient, testLogger, true)
116125

pkg/controller/pod_sync_test.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/assert"
1010
corev1 "k8s.io/api/core/v1"
1111
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
12+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
1213

1314
"github.com/prometheus/client_golang/prometheus"
1415

@@ -24,8 +25,13 @@ import (
2425
// Test for the sync method.
2526
func TestController_Sync(t *testing.T) {
2627
t.Parallel()
28+
2729
log := logrus.NewEntry(logrus.New())
28-
metrics := metrics.New(log, prometheus.NewRegistry())
30+
metrics := metrics.New(
31+
logrus.NewEntry(logrus.StandardLogger()),
32+
prometheus.NewRegistry(),
33+
fake.NewFakeClient(),
34+
)
2935
imageClient := &client.Client{}
3036
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
3137
checker := checker.New(searcher)
@@ -60,7 +66,7 @@ func TestController_Sync(t *testing.T) {
6066
func TestController_SyncContainer(t *testing.T) {
6167
t.Parallel()
6268
log := logrus.NewEntry(logrus.New())
63-
metrics := metrics.New(log, prometheus.NewRegistry())
69+
metrics := metrics.New(log, prometheus.NewRegistry(), fake.NewFakeClient())
6470
imageClient := &client.Client{}
6571
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
6672
checker := checker.New(searcher)
@@ -92,7 +98,7 @@ func TestController_SyncContainer(t *testing.T) {
9298
func TestController_CheckContainer(t *testing.T) {
9399
t.Parallel()
94100
log := logrus.NewEntry(logrus.New())
95-
metrics := metrics.New(log, prometheus.NewRegistry())
101+
metrics := metrics.New(log, prometheus.NewRegistry(), fake.NewFakeClient())
96102
imageClient := &client.Client{}
97103
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
98104
checker := checker.New(searcher)
@@ -122,7 +128,7 @@ func TestController_SyncContainer_NoVersionFound(t *testing.T) {
122128
t.Parallel()
123129

124130
log := logrus.NewEntry(logrus.New())
125-
metrics := metrics.New(log, prometheus.NewRegistry())
131+
metrics := metrics.New(log, prometheus.NewRegistry(), fake.NewFakeClient())
126132
imageClient := &client.Client{}
127133
searcher := search.New(log, 5*time.Minute, version.New(log, imageClient, 5*time.Minute))
128134
checker := checker.New(searcher)

pkg/metrics/metrics.go

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
package metrics
22

33
import (
4+
"context"
45
"sync"
56
"time"
67

78
"github.com/sirupsen/logrus"
89

10+
corev1 "k8s.io/api/core/v1"
11+
"k8s.io/apimachinery/pkg/types"
12+
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
13+
914
"github.com/prometheus/client_golang/prometheus"
1015
"github.com/prometheus/client_golang/prometheus/collectors"
1116
"github.com/prometheus/client_golang/prometheus/promauto"
@@ -22,13 +27,16 @@ type Metrics struct {
2227
containerImageDuration *prometheus.GaugeVec
2328
containerImageErrors *prometheus.CounterVec
2429

30+
cache k8sclient.Reader
31+
2532
// Contains all metrics for the roundtripper
2633
roundTripper *RoundTripper
2734

2835
mu sync.Mutex
2936
}
3037

31-
func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer) *Metrics {
38+
// func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer, kubeClient k8sclient.Client) *Metrics {
39+
func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer, cache k8sclient.Reader) *Metrics {
3240
// Attempt to register, but ignore errors
3341
_ = reg.Register(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
3442
_ = reg.Register(collectors.NewGoCollector())
@@ -63,7 +71,9 @@ func New(log *logrus.Entry, reg ctrmetrics.RegistererGatherer) *Metrics {
6371
)
6472

6573
return &Metrics{
66-
log: log.WithField("module", "metrics"),
74+
log: log.WithField("module", "metrics"),
75+
cache: cache,
76+
6777
registry: reg,
6878
containerImageVersion: containerImageVersion,
6979
containerImageDuration: containerImageDuration,
@@ -125,6 +135,11 @@ func (m *Metrics) RegisterImageDuration(namespace, pod, container, image string,
125135
m.mu.Lock()
126136
defer m.mu.Unlock()
127137

138+
if !m.PodExists(context.Background(), namespace, pod) {
139+
m.log.WithField("metric", "RegisterImageDuration").Warnf("pod %s/%s not found, not registering error", namespace, pod)
140+
return
141+
}
142+
128143
m.containerImageDuration.WithLabelValues(
129144
namespace, pod, container, image,
130145
).Set(time.Since(startTime).Seconds())
@@ -134,6 +149,11 @@ func (m *Metrics) ReportError(namespace, pod, container, imageURL string) {
134149
m.mu.Lock()
135150
defer m.mu.Unlock()
136151

152+
if !m.PodExists(context.Background(), namespace, pod) {
153+
m.log.WithField("metric", "ReportError").Warnf("pod %s/%s not found, not registering error", namespace, pod)
154+
return
155+
}
156+
137157
m.containerImageErrors.WithLabelValues(
138158
namespace, pod, container, imageURL,
139159
).Inc()
@@ -157,3 +177,10 @@ func (m *Metrics) buildPartialLabels(namespace, pod string) prometheus.Labels {
157177
"pod": pod,
158178
}
159179
}
180+
181+
// This _should_ leverage the Controllers Cache
182+
func (m *Metrics) PodExists(ctx context.Context, ns, name string) bool {
183+
pod := &corev1.Pod{}
184+
err := m.cache.Get(ctx, types.NamespacedName{Name: name, Namespace: ns}, pod)
185+
return err == nil && pod.GetDeletionTimestamp() == nil
186+
}

pkg/metrics/metrics_test.go

Lines changed: 97 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,30 @@
11
package metrics
22

33
import (
4+
"context"
45
"fmt"
56
"testing"
7+
"time"
68

7-
"github.com/prometheus/client_golang/prometheus"
8-
"github.com/prometheus/client_golang/prometheus/testutil"
99
"github.com/sirupsen/logrus"
1010
"github.com/stretchr/testify/assert"
11+
"github.com/stretchr/testify/require"
12+
13+
"k8s.io/apimachinery/pkg/runtime"
14+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
15+
16+
"github.com/prometheus/client_golang/prometheus"
17+
"github.com/prometheus/client_golang/prometheus/testutil"
18+
19+
corev1 "k8s.io/api/core/v1"
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
types "k8s.io/apimachinery/pkg/types"
1122
)
1223

24+
var fakek8s = fake.NewFakeClient()
25+
1326
func TestCache(t *testing.T) {
14-
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry())
27+
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry(), fakek8s)
1528

1629
for i, typ := range []string{"init", "container"} {
1730
version := fmt.Sprintf("0.1.%d", i)
@@ -20,7 +33,9 @@ func TestCache(t *testing.T) {
2033

2134
for i, typ := range []string{"init", "container"} {
2235
version := fmt.Sprintf("0.1.%d", i)
23-
mt, _ := m.containerImageVersion.GetMetricWith(m.buildLabels("namespace", "pod", "container", typ, "url", version, version))
36+
mt, _ := m.containerImageVersion.GetMetricWith(
37+
m.buildLabels("namespace", "pod", "container", typ, "url", version, version),
38+
)
2439
count := testutil.ToFloat64(mt)
2540
assert.Equal(t, count, float64(1), "Expected to get a metric for containerImageVersion")
2641
}
@@ -30,15 +45,17 @@ func TestCache(t *testing.T) {
3045
}
3146
for i, typ := range []string{"init", "container"} {
3247
version := fmt.Sprintf("0.1.%d", i)
33-
mt, _ := m.containerImageVersion.GetMetricWith(m.buildLabels("namespace", "pod", "container", typ, "url", version, version))
48+
mt, _ := m.containerImageVersion.GetMetricWith(
49+
m.buildLabels("namespace", "pod", "container", typ, "url", version, version),
50+
)
3451
count := testutil.ToFloat64(mt)
35-
assert.Equal(t, count, float64(0), "Expected to get a metric for containerImageVersion")
52+
assert.Equal(t, count, float64(0), "Expected NOT to get a metric for containerImageVersion")
3653
}
3754
}
3855

3956
// TestErrorsReporting verifies that the error metric increments correctly
4057
func TestErrorsReporting(t *testing.T) {
41-
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry())
58+
m := New(logrus.NewEntry(logrus.New()), prometheus.NewRegistry(), fakek8s)
4259

4360
// Reset the metrics before testing
4461
m.containerImageErrors.Reset()
@@ -57,6 +74,16 @@ func TestErrorsReporting(t *testing.T) {
5774

5875
for i, tc := range testCases {
5976
t.Run(fmt.Sprintf("Case %d", i+1), func(t *testing.T) {
77+
err := fakek8s.DeleteAllOf(context.Background(), &corev1.Pod{})
78+
require.NoError(t, err)
79+
80+
// We need to ensure that the pod Exists!
81+
err = fakek8s.Create(context.Background(), &corev1.Pod{
82+
ObjectMeta: metav1.ObjectMeta{Name: tc.pod, Namespace: tc.namespace},
83+
Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: tc.container, Image: tc.image}}},
84+
})
85+
require.NoError(t, err)
86+
6087
// Report an error
6188
m.ReportError(tc.namespace, tc.pod, tc.container, tc.image)
6289

@@ -75,3 +102,66 @@ func TestErrorsReporting(t *testing.T) {
75102
})
76103
}
77104
}
105+
106+
func Test_Metrics_SkipOnDeletedPod(t *testing.T) {
107+
scheme := runtime.NewScheme()
108+
_ = corev1.AddToScheme(scheme)
109+
110+
// Step 1: Create fake client with Pod
111+
pod := &corev1.Pod{
112+
ObjectMeta: metav1.ObjectMeta{
113+
Name: "mypod",
114+
Namespace: "default",
115+
UID: types.UID("test-uid"),
116+
},
117+
}
118+
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(pod).Build()
119+
120+
// Step 2: Create Metrics with fake registry
121+
reg := prometheus.NewRegistry()
122+
log := logrus.NewEntry(logrus.New())
123+
metrics := New(log, reg, client)
124+
125+
// verify Pod exists!
126+
require.True(t,
127+
metrics.PodExists(context.Background(), "default", "mypod"),
128+
"Pod should exist at this point!",
129+
)
130+
131+
// Register some metrics....
132+
metrics.RegisterImageDuration("default", "mypod", "mycontainer", "nginx:latest", time.Now())
133+
134+
// Step 3: Simulate a Delete occuring, Whilst still Reconciling...
135+
_ = client.Delete(context.Background(), pod)
136+
metrics.RemovePod("default", "mypod")
137+
138+
// Step 4: Validate that all metrics have been removed...
139+
metricFamilies, err := reg.Gather()
140+
assert.NoError(t, err)
141+
for _, mf := range metricFamilies {
142+
assert.NotContains(t, *mf.Name, "is_latest_version", "Should not have been found: %+v", mf)
143+
assert.NotContains(t, *mf.Name, "image_lookup_duration", "Should not have been found: %+v", mf)
144+
assert.NotContains(t, *mf.Name, "image_failures_total", "Should not have been found: %+v", mf)
145+
}
146+
147+
// Register Error _after_ sync has completed!
148+
metrics.ReportError("default", "mypod", "mycontianer", "nginx:latest")
149+
150+
// Step 5: Attempt to register metrics (should not register anything)
151+
require.False(t,
152+
metrics.PodExists(context.Background(), "default", "mypod"),
153+
"Pod should NOT exist at this point!",
154+
)
155+
156+
metrics.RegisterImageDuration("default", "mypod", "mycontainer", "nginx:latest", time.Now())
157+
metrics.ReportError("default", "mypod", "mycontianer", "nginx:latest")
158+
159+
// Step 6: Gather metrics and assert none were registered
160+
metricFamilies, err = reg.Gather()
161+
assert.NoError(t, err)
162+
for _, mf := range metricFamilies {
163+
assert.NotContains(t, *mf.Name, "is_latest_version", "Should not have been found: %+v", mf)
164+
assert.NotContains(t, *mf.Name, "image_lookup_duration", "Should not have been found: %+v", mf)
165+
assert.NotContains(t, *mf.Name, "image_failures_total", "Should not have been found: %+v", mf)
166+
}
167+
}

pkg/metrics/roundtripper_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ func TestRoundTripper(t *testing.T) {
164164

165165
for _, tt := range tests {
166166
t.Run(tt.name, func(t *testing.T) {
167-
metricsServer := New(log, prometheus.NewRegistry())
167+
metricsServer := New(log, prometheus.NewRegistry(), fakek8s)
168168
server := httptest.NewServer(tt.handler)
169169
defer server.Close()
170170

0 commit comments

Comments
 (0)