Skip to content

Commit 88ea3c2

Browse files
huntergregoryHunter Gregory
andauthored
Prometheus metrics (#590)
* prometheus additions to testmain (commented out right now) * home of the npm prometheus metrics and tools for updating them, testing them * add/remove policy metrics * add/remove iptables rule metric measurements * add/remove ipset metric measurements * testing for gauges. want to soon remove the boolean for including prometheus in unit testing * run http server that exposes prometheus from main * cleaner test additions with less code * removed incorrect instance of AddSet in the TestDeleteSet test * added prometheus annotations to pod templates * deleted unused file * much more organized initialization of metrics now. now includes map from metric to metric name * add ability to get summary count value. now getting gauge values and this new count value are done by passing the metric itself as a param instead of a string * condenses prometheus testing code base by condensing all prometheus error messages into a function * added testing for summary counts, condensed prometheus error handling code, and updated calls to use new form for getting metric values * update based on variable spelling change in metrics package * Added comments for functions and moved http handler code to the http file * fixed problem of registering same metric name for different metrics, and passing in the wrong param type for testing * made prometheus testing folder with interactive testing file. moved old random metric flux testing function over from ipsm_test * moved testing around again * fixed spelling mistake * counting mistake in unit test * handler variable ws in wrong file. Changed stdout printing to logging * fixed parameter errors and counting error in a test * moved utilities for testing prometheus metrics to npm/util. 
Updated StartHTTP to have an additional parameter for waiting after starting the server * updated uses of StartHTTP to have the extra parameter * updated GetValue and GetCountValue uses to use the prometheus features of the util package, which is now moved to a promutil package within npm/metrics/ * removed unnecessary comments, removed print statement, and added quantiles to all summary metrics * fixed problem of double registering metrics * wait longer for http server to start * moved tool in test-util.go to promutil/util.go * fixed timer to be in milliseconds and updated metric descriptions to mention units * removed unnecessary comments * http server always started in a go routine now. Added comment justifying the use of an http server * debugging http connection refused in pipeline * fixed syntax error * removed debugging wrapper around http service * sleep so that the testing metrics endpoint can be pinged * redesigned GetValue and GetCountValue so that they don't use http calls * removed random but helpful testing file - will write about quick testing in a wiki page * milliseconds were being truncated. now they have decimals * use direct Prometheus metric commands instead of wrapping them * removed code used when testing was done through http server. Moved registering to metric creation functions * added createGaugeVec, updated comments, made all help strings constants * added metric that counts number of entries in each ipset. still need to add tests * fixed creation of GaugeVecs, and use explicit labeling instead of order-based labeling now * updated GetVecValue method signature * added set to metrics on creation and wrote unit tests for CreateSet, AddToSet, DeleteFromSet, DeleteSet * use custom registry to limit content that Container Insights scrapes. 
Also log the start of http server * wrote TODO item comments for Restore and Destroy (currently these functions are only used in testing) * NPM won't crash if a Prometheus metric fails to register now (unlikely). Added logging for metric registration/creation, and explicit public function to initialize metrics so that we can finish log config first * initialize metrics in unit tests * renamed util.go to test-util.go Co-authored-by: Hunter Gregory <[email protected]>
1 parent b92d8c6 commit 88ea3c2

File tree

13 files changed

+402
-12
lines changed

13 files changed

+402
-12
lines changed

npm/azure-npm.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ roleRef:
4949
name: azure-npm
5050
apiGroup: rbac.authorization.k8s.io
5151
---
52-
apiVersion: extensions/v1beta1
52+
apiVersion: apps/v1
5353
kind: DaemonSet
5454
metadata:
5555
name: azure-npm
@@ -67,6 +67,8 @@ spec:
6767
k8s-app: azure-npm
6868
annotations:
6969
scheduler.alpha.kubernetes.io/critical-pod: ''
70+
prometheus.io/scrape: "true"
71+
prometheus.io/port: "8000"
7072
spec:
7173
priorityClassName: system-node-critical
7274
tolerations:

npm/ipsm/ipsm.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ import (
99
"syscall"
1010

1111
"github.com/Azure/azure-container-networking/log"
12+
"github.com/Azure/azure-container-networking/npm/metrics"
1213
"github.com/Azure/azure-container-networking/npm/util"
14+
"github.com/prometheus/client_golang/prometheus"
1315
)
1416

1517
type ipsEntry struct {
@@ -180,6 +182,8 @@ func (ipsMgr *IpsetManager) DeleteFromList(listName string, setName string) erro
180182

181183
// CreateSet creates an ipset.
182184
func (ipsMgr *IpsetManager) CreateSet(setName string, spec []string) error {
185+
timer := metrics.StartNewTimer()
186+
183187
if _, exists := ipsMgr.setMap[setName]; exists {
184188
return nil
185189
}
@@ -199,6 +203,10 @@ func (ipsMgr *IpsetManager) CreateSet(setName string, spec []string) error {
199203

200204
ipsMgr.setMap[setName] = NewIpset(setName)
201205

206+
metrics.NumIPSets.Inc()
207+
timer.StopAndRecord(metrics.AddIPSetExecTime)
208+
metrics.IPSetInventory.With(prometheus.Labels{metrics.SetNameLabel: setName}).Set(0)
209+
202210
return nil
203211
}
204212

@@ -225,6 +233,9 @@ func (ipsMgr *IpsetManager) DeleteSet(setName string) error {
225233

226234
delete(ipsMgr.setMap, setName)
227235

236+
metrics.NumIPSets.Dec()
237+
metrics.IPSetInventory.With(prometheus.Labels{metrics.SetNameLabel: setName}).Set(0)
238+
228239
return nil
229240
}
230241

@@ -269,6 +280,8 @@ func (ipsMgr *IpsetManager) AddToSet(setName, ip, spec, podUid string) error {
269280
// Stores the podUid as the context for this ip.
270281
ipsMgr.setMap[setName].elements[ip] = podUid
271282

283+
metrics.IPSetInventory.With(prometheus.Labels{metrics.SetNameLabel: setName}).Inc()
284+
272285
return nil
273286
}
274287

@@ -310,6 +323,8 @@ func (ipsMgr *IpsetManager) DeleteFromSet(setName, ip, podUid string) error {
310323
// Now cleanup the cache
311324
delete(ipsMgr.setMap[setName].elements, ip)
312325

326+
metrics.IPSetInventory.With(prometheus.Labels{metrics.SetNameLabel: setName}).Dec()
327+
313328
if len(ipsMgr.setMap[setName].elements) == 0 {
314329
ipsMgr.DeleteSet(setName)
315330
}
@@ -360,6 +375,8 @@ func (ipsMgr *IpsetManager) Destroy() error {
360375
return err
361376
}
362377

378+
//TODO set metrics.IPSetInventory to 0 for all set names
379+
363380
return nil
364381
}
365382

@@ -424,5 +441,7 @@ func (ipsMgr *IpsetManager) Restore(configFile string) error {
424441
}
425442
cmd.Wait()
426443

444+
//TODO based on the set name and number of entries in the config file, update metrics.IPSetInventory
445+
427446
return nil
428447
}

npm/ipsm/ipsm_test.go

Lines changed: 61 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ import (
66
"os"
77
"testing"
88

9+
"github.com/Azure/azure-container-networking/npm/metrics"
10+
"github.com/Azure/azure-container-networking/npm/metrics/promutil"
911
"github.com/Azure/azure-container-networking/npm/util"
12+
"github.com/prometheus/client_golang/prometheus"
1013
)
1114

1215
func TestSave(t *testing.T) {
@@ -127,14 +130,34 @@ func TestCreateSet(t *testing.T) {
127130
}
128131
}()
129132

130-
if err := ipsMgr.CreateSet("test-set", []string{util.IpsetNetHashFlag}); err != nil {
133+
gaugeVal, err1 := promutil.GetValue(metrics.NumIPSets)
134+
countVal, err2 := promutil.GetCountValue(metrics.AddIPSetExecTime)
135+
136+
testSet1Name := "test-set"
137+
if err := ipsMgr.CreateSet(testSet1Name, []string{util.IpsetNetHashFlag}); err != nil {
131138
t.Errorf("TestCreateSet failed @ ipsMgr.CreateSet")
132139
}
133140

141+
testSet2Name := "test-set-with-maxelem"
134142
spec := append([]string{util.IpsetNetHashFlag, util.IpsetMaxelemName, util.IpsetMaxelemNum})
135-
if err := ipsMgr.CreateSet("test-set-with-maxelem", spec); err != nil {
143+
if err := ipsMgr.CreateSet(testSet2Name, spec); err != nil {
136144
t.Errorf("TestCreateSet failed @ ipsMgr.CreateSet when set maxelem")
137145
}
146+
147+
newGaugeVal, err3 := promutil.GetValue(metrics.NumIPSets)
148+
newCountVal, err4 := promutil.GetCountValue(metrics.AddIPSetExecTime)
149+
testSet1Count, err5 := promutil.GetVecValue(metrics.IPSetInventory, prometheus.Labels{metrics.SetNameLabel: testSet1Name})
150+
testSet2Count, err6 := promutil.GetVecValue(metrics.IPSetInventory, prometheus.Labels{metrics.SetNameLabel: testSet2Name})
151+
promutil.NotifyIfErrors(t, err1, err2, err3, err4, err5, err6)
152+
if newGaugeVal != gaugeVal+2 {
153+
t.Errorf("Change in ipset number didn't register in Prometheus")
154+
}
155+
if newCountVal != countVal+2 {
156+
t.Errorf("Execution time didn't register in Prometheus")
157+
}
158+
if testSet1Count != 0 || testSet2Count != 0 {
159+
t.Errorf("Prometheus IPSet count has incorrect number of entries")
160+
}
138161
}
139162

140163
func TestDeleteSet(t *testing.T) {
@@ -149,13 +172,26 @@ func TestDeleteSet(t *testing.T) {
149172
}
150173
}()
151174

152-
if err := ipsMgr.CreateSet("test-set", append([]string{util.IpsetNetHashFlag})); err != nil {
175+
testSetName := "test-set"
176+
if err := ipsMgr.CreateSet(testSetName, append([]string{util.IpsetNetHashFlag})); err != nil {
153177
t.Errorf("TestDeleteSet failed @ ipsMgr.CreateSet")
154178
}
155179

156-
if err := ipsMgr.DeleteSet("test-set"); err != nil {
180+
gaugeVal, err1 := promutil.GetValue(metrics.NumIPSets)
181+
182+
if err := ipsMgr.DeleteSet(testSetName); err != nil {
157183
t.Errorf("TestDeleteSet failed @ ipsMgr.DeleteSet")
158184
}
185+
186+
newGaugeVal, err2 := promutil.GetValue(metrics.NumIPSets)
187+
testSetCount, err3 := promutil.GetVecValue(metrics.IPSetInventory, prometheus.Labels{metrics.SetNameLabel: testSetName})
188+
promutil.NotifyIfErrors(t, err1, err2, err3)
189+
if newGaugeVal != gaugeVal-1 {
190+
t.Errorf("Change in ipset number didn't register in prometheus")
191+
}
192+
if testSetCount != 0 {
193+
t.Errorf("Prometheus IPSet count has incorrect number of entries")
194+
}
159195
}
160196

161197
func TestAddToSet(t *testing.T) {
@@ -170,13 +206,20 @@ func TestAddToSet(t *testing.T) {
170206
}
171207
}()
172208

173-
if err := ipsMgr.AddToSet("test-set", "1.2.3.4", util.IpsetNetHashFlag, ""); err != nil {
209+
testSetName := "test-set"
210+
if err := ipsMgr.AddToSet(testSetName, "1.2.3.4", util.IpsetNetHashFlag, ""); err != nil {
174211
t.Errorf("TestAddToSet failed @ ipsMgr.AddToSet")
175212
}
176213

177-
if err := ipsMgr.AddToSet("test-set", "1.2.3.4/nomatch", util.IpsetNetHashFlag, ""); err != nil {
214+
if err := ipsMgr.AddToSet(testSetName, "1.2.3.4/nomatch", util.IpsetNetHashFlag, ""); err != nil {
178215
t.Errorf("TestAddToSet with nomatch failed @ ipsMgr.AddToSet")
179216
}
217+
218+
testSetCount, err1 := promutil.GetVecValue(metrics.IPSetInventory, prometheus.Labels{metrics.SetNameLabel: testSetName})
219+
promutil.NotifyIfErrors(t, err1)
220+
if testSetCount != 2 {
221+
t.Errorf("Prometheus IPSet count has incorrect number of entries")
222+
}
180223
}
181224

182225
func TestAddToSetWithCachePodInfo(t *testing.T) {
@@ -231,22 +274,29 @@ func TestDeleteFromSet(t *testing.T) {
231274
}
232275
}()
233276

234-
if err := ipsMgr.AddToSet("test-set", "1.2.3.4", util.IpsetNetHashFlag, ""); err != nil {
277+
testSetName := "test-set"
278+
if err := ipsMgr.AddToSet(testSetName, "1.2.3.4", util.IpsetNetHashFlag, ""); err != nil {
235279
t.Errorf("TestDeleteFromSet failed @ ipsMgr.AddToSet")
236280
}
237281

238-
if len(ipsMgr.setMap["test-set"].elements) != 1 {
282+
if len(ipsMgr.setMap[testSetName].elements) != 1 {
239283
t.Errorf("TestDeleteFromSet failed @ ipsMgr.AddToSet")
240284
}
241285

242-
if err := ipsMgr.DeleteFromSet("test-set", "1.2.3.4", ""); err != nil {
286+
if err := ipsMgr.DeleteFromSet(testSetName, "1.2.3.4", ""); err != nil {
243287
t.Errorf("TestDeleteFromSet failed @ ipsMgr.DeleteFromSet")
244288
}
245289

246290
// After deleting the only entry, "1.2.3.4" from "test-set", "test-set" ipset won't exist
247-
if _, exists := ipsMgr.setMap["test-set"]; exists {
291+
if _, exists := ipsMgr.setMap[testSetName]; exists {
248292
t.Errorf("TestDeleteFromSet failed @ ipsMgr.DeleteFromSet")
249293
}
294+
295+
testSetCount, err1 := promutil.GetVecValue(metrics.IPSetInventory, prometheus.Labels{metrics.SetNameLabel: testSetName})
296+
promutil.NotifyIfErrors(t, err1)
297+
if testSetCount != 0 {
298+
t.Errorf("Prometheus IPSet count has incorrect number of entries")
299+
}
250300
}
251301

252302
func TestDeleteFromSetWithPodCache(t *testing.T) {
@@ -373,6 +423,7 @@ func TestRun(t *testing.T) {
373423
}
374424

375425
func TestMain(m *testing.M) {
426+
metrics.InitializeAll()
376427
ipsMgr := NewIpsetManager()
377428
ipsMgr.Save(util.IpsetConfigFile)
378429

npm/iptm/iptm.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"golang.org/x/sys/unix"
1818

1919
"github.com/Azure/azure-container-networking/log"
20+
"github.com/Azure/azure-container-networking/npm/metrics"
2021
"github.com/Azure/azure-container-networking/npm/util"
2122
"k8s.io/apimachinery/pkg/util/wait"
2223
// utiliptables "k8s.io/kubernetes/pkg/util/iptables"
@@ -298,6 +299,8 @@ func (iptMgr *IptablesManager) DeleteChain(chain string) error {
298299

299300
// Add adds a rule in iptables.
300301
func (iptMgr *IptablesManager) Add(entry *IptEntry) error {
302+
timer := metrics.StartNewTimer()
303+
301304
log.Logf("Adding iptables entry: %+v.", entry)
302305

303306
if entry.IsJumpEntry {
@@ -310,6 +313,9 @@ func (iptMgr *IptablesManager) Add(entry *IptEntry) error {
310313
return err
311314
}
312315

316+
metrics.NumIPTableRules.Inc()
317+
timer.StopAndRecord(metrics.AddIPTableRuleExecTime)
318+
313319
return nil
314320
}
315321

@@ -332,6 +338,8 @@ func (iptMgr *IptablesManager) Delete(entry *IptEntry) error {
332338
return err
333339
}
334340

341+
metrics.NumIPTableRules.Dec()
342+
335343
return nil
336344
}
337345

npm/iptm/iptm_test.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
package iptm
22

33
import (
4-
"testing"
54
"os"
5+
"testing"
66

7+
"github.com/Azure/azure-container-networking/npm/metrics"
8+
"github.com/Azure/azure-container-networking/npm/metrics/promutil"
79
"github.com/Azure/azure-container-networking/npm/util"
810
)
911

@@ -147,9 +149,23 @@ func TestAdd(t *testing.T) {
147149
util.IptablesReject,
148150
},
149151
}
152+
153+
gaugeVal, err1 := promutil.GetValue(metrics.NumIPTableRules)
154+
countVal, err2 := promutil.GetCountValue(metrics.AddIPTableRuleExecTime)
155+
150156
if err := iptMgr.Add(entry); err != nil {
151157
t.Errorf("TestAdd failed @ iptMgr.Add")
152158
}
159+
160+
newGaugeVal, err3 := promutil.GetValue(metrics.NumIPTableRules)
161+
newCountVal, err4 := promutil.GetCountValue(metrics.AddIPTableRuleExecTime)
162+
promutil.NotifyIfErrors(t, err1, err2, err3, err4)
163+
if newGaugeVal != gaugeVal+1 {
164+
t.Errorf("Change in iptable rule number didn't register in prometheus")
165+
}
166+
if newCountVal != countVal+1 {
167+
t.Errorf("Execution time didn't register in prometheus")
168+
}
153169
}
154170

155171
func TestDelete(t *testing.T) {
@@ -175,9 +191,17 @@ func TestDelete(t *testing.T) {
175191
t.Errorf("TestDelete failed @ iptMgr.Add")
176192
}
177193

194+
gaugeVal, err1 := promutil.GetValue(metrics.NumIPTableRules)
195+
178196
if err := iptMgr.Delete(entry); err != nil {
179197
t.Errorf("TestDelete failed @ iptMgr.Delete")
180198
}
199+
200+
newGaugeVal, err2 := promutil.GetValue(metrics.NumIPTableRules)
201+
promutil.NotifyIfErrors(t, err1, err2)
202+
if newGaugeVal != gaugeVal-1 {
203+
t.Errorf("Change in iptable rule number didn't register in prometheus")
204+
}
181205
}
182206

183207
func TestRun(t *testing.T) {
@@ -202,6 +226,7 @@ func TestRun(t *testing.T) {
202226
}
203227

204228
func TestMain(m *testing.M) {
229+
metrics.InitializeAll()
205230
iptMgr := NewIptablesManager()
206231
iptMgr.Save(util.IptablesConfigFile)
207232

npm/metrics/http.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package metrics
2+
3+
import (
4+
"net/http"
5+
"time"
6+
7+
"github.com/Azure/azure-container-networking/log"
8+
"github.com/prometheus/client_golang/prometheus/promhttp"
9+
)
10+
11+
const (
12+
// HTTPPort is the port used by the HTTP server (includes a preceding colon)
13+
HTTPPort = ":8000"
14+
15+
// MetricsPath is the path for the Prometheus metrics endpoint (includes a preceding slash)
16+
MetricsPath = "/metrics"
17+
)
18+
19+
var started = false
20+
var handler http.Handler
21+
22+
// StartHTTP starts a HTTP server in a Go routine with endpoint on port 8000. Metrics are exposed on the endpoint /metrics.
23+
// By being exposed, the metrics can be scraped by a Prometheus Server or Container Insights.
24+
// The function will pause for delayAmountAfterStart seconds after starting the HTTP server for the first time.
25+
func StartHTTP(delayAmountAfterStart int) {
26+
if started {
27+
return
28+
}
29+
started = true
30+
31+
http.Handle(MetricsPath, getHandler())
32+
log.Logf("Starting Prometheus HTTP Server")
33+
go http.ListenAndServe(HTTPPort, nil)
34+
time.Sleep(time.Second * time.Duration(delayAmountAfterStart))
35+
}
36+
37+
// getHandler returns the HTTP handler for the metrics endpoint
38+
func getHandler() http.Handler {
39+
if handler == nil {
40+
handler = promhttp.HandlerFor(registry, promhttp.HandlerOpts{})
41+
}
42+
return handler
43+
}

0 commit comments

Comments
 (0)