Skip to content

Commit 443913e

Browse files
authored
Merge pull request #1956 from googs1025/chore/add_metrics_prefix
chore: add metrics system prefix
2 parents 0efd38e + e631a52 commit 443913e

File tree

6 files changed

+117
-79
lines changed

6 files changed

+117
-79
lines changed

docs/deployment/metrics.md

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,25 @@ By default NFD Master and Worker expose metrics on port 8081.
1313

1414
The exposed metrics are
1515

16-
| Metric | Type | Description |
17-
| ------------------------------------------------- | --------- | ------------------------------------------------------- |
18-
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built |
19-
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built |
20-
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built |
21-
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built |
22-
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC |
23-
| `nfd_node_updates_total` | Counter | Number of nodes updated |
24-
| `nfd_node_update_failures_total` | Counter | Number of nodes update failures |
25-
| `nfd_node_labels_rejected_total` | Counter | Number of nodes labels rejected by nfd-master |
26-
| `nfd_node_extendedresources_rejected_total` | Counter | Number of nodes extended resources rejected by nfd-master |
27-
| `nfd_node_taints_rejected_total` | Counter | Number of nodes taints rejected by nfd-master |
28-
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects |
29-
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number or errors encountered while processing NodeFeatureRule objects |
30-
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node |
31-
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. |
32-
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. |
33-
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. |
16+
| Metric | Type | Description |
17+
| -------------------------------------------------------- | --------- | -------------------------------------------------------------------------- |
18+
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built |
19+
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built |
20+
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built |
21+
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built |
22+
| `nfd_master_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC |
23+
| `nfd_master_node_updates_total` | Counter | Number of nodes updated |
24+
| `nfd_master_node_feature_group_update_requests_total` | Counter | Number of cluster feature update requests processed by the master |
25+
| `nfd_master_node_update_failures_total` | Counter | Number of node update failures |
26+
| `nfd_master_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master |
27+
| `nfd_master_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master |
28+
| `nfd_master_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master |
29+
| `nfd_master_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects |
30+
| `nfd_master_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects |
31+
| `nfd_worker_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node |
32+
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. |
33+
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. |
34+
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. |
3435

3536
## Kustomize
3637

examples/grafana-dashboard.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@
391391
},
392392
"disableTextWrap": false,
393393
"editorMode": "builder",
394-
"expr": "nfd_node_updates_total",
394+
"expr": "nfd_master_node_updates_total",
395395
"fullMetaSearch": false,
396396
"hide": false,
397397
"includeNullMetadata": true,
@@ -586,7 +586,7 @@
586586
"uid": "prometheus"
587587
},
588588
"editorMode": "builder",
589-
"expr": "nfd_node_update_failures_total",
589+
"expr": "nfd_master_node_update_failures_total",
590590
"legendFormat": "total",
591591
"range": true,
592592
"refId": "A"
@@ -679,7 +679,7 @@
679679
"uid": "prometheus"
680680
},
681681
"editorMode": "builder",
682-
"expr": "nfd_nodefeaturerule_processing_errors_total",
682+
"expr": "nfd_master_nodefeaturerule_processing_errors_total",
683683
"hide": false,
684684
"legendFormat": "total",
685685
"range": true,
@@ -940,7 +940,7 @@
940940
"uid": "prometheus"
941941
},
942942
"editorMode": "builder",
943-
"expr": "sum by(le) (nfd_feature_discovery_duration_seconds_bucket)",
943+
"expr": "sum by(le) (nfd_worker_feature_discovery_duration_seconds_bucket)",
944944
"format": "heatmap",
945945
"legendFormat": "__auto",
946946
"range": true,
@@ -1007,7 +1007,7 @@
10071007
"uid": "prometheus"
10081008
},
10091009
"editorMode": "builder",
1010-
"expr": "sum by(le) (nfd_nodefeaturerule_processing_duration_seconds_bucket)",
1010+
"expr": "sum by(le) (nfd_master_nodefeaturerule_processing_duration_seconds_bucket)",
10111011
"format": "heatmap",
10121012
"legendFormat": "__auto",
10131013
"range": true,
@@ -1101,7 +1101,7 @@
11011101
"uid": "prometheus"
11021102
},
11031103
"editorMode": "builder",
1104-
"expr": "nfd_node_labels_rejected_total",
1104+
"expr": "nfd_master_node_labels_rejected_total",
11051105
"legendFormat": "total",
11061106
"range": true,
11071107
"refId": "A"
@@ -1194,7 +1194,7 @@
11941194
"uid": "prometheus"
11951195
},
11961196
"editorMode": "builder",
1197-
"expr": "nfd_node_extendedresources_rejected_total",
1197+
"expr": "nfd_master_node_extendedresources_rejected_total",
11981198
"hide": false,
11991199
"legendFormat": "total",
12001200
"range": true,
@@ -1288,7 +1288,7 @@
12881288
"uid": "prometheus"
12891289
},
12901290
"editorMode": "builder",
1291-
"expr": "nfd_node_taints_rejected_total",
1291+
"expr": "nfd_master_node_taints_rejected_total",
12921292
"hide": false,
12931293
"legendFormat": "total",
12941294
"range": true,

pkg/nfd-gc/metrics.go

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,27 +23,35 @@ import (
2323

2424
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
2525
const (
26-
buildInfoQuery = "nfd_gc_build_info"
27-
objectsDeletedQuery = "nfd_gc_objects_deleted_total"
28-
objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total"
26+
buildInfoQuery = "build_info"
27+
objectsDeletedQuery = "objects_deleted_total"
28+
objectDeleteErrorsQuery = "object_delete_failures_total"
29+
)
30+
31+
const (
32+
// nfdGCPrefix - subsystem name used by nfd gc.
33+
nfdGCPrefix = "nfd_gc"
2934
)
3035

3136
var (
3237
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
33-
Name: buildInfoQuery,
34-
Help: "Version from which Node Feature Discovery was built.",
38+
Subsystem: nfdGCPrefix,
39+
Name: buildInfoQuery,
40+
Help: "Version from which Node Feature Discovery was built.",
3541
ConstLabels: map[string]string{
3642
"version": version.Get(),
3743
},
3844
})
3945
objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{
40-
Name: objectsDeletedQuery,
41-
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
46+
Subsystem: nfdGCPrefix,
47+
Name: objectsDeletedQuery,
48+
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
4249
[]string{"kind"},
4350
)
4451
objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
45-
Name: objectDeleteErrorsQuery,
46-
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
52+
Subsystem: nfdGCPrefix,
53+
Name: objectDeleteErrorsQuery,
54+
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
4755
[]string{"kind"},
4856
)
4957
)

pkg/nfd-master/metrics.go

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,68 +23,83 @@ import (
2323

2424
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
2525
const (
26-
buildInfoQuery = "nfd_master_build_info"
27-
nodeUpdateRequestsQuery = "nfd_node_update_requests_total"
28-
nodeUpdatesQuery = "nfd_node_updates_total"
29-
nodeFeatureGroupUpdateRequestsQuery = "nfd_node_feature_group_update_requests_total"
30-
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
31-
nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total"
32-
nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total"
33-
nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total"
34-
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
35-
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total"
26+
buildInfoQuery = "build_info"
27+
nodeUpdateRequestsQuery = "node_update_requests_total"
28+
nodeUpdatesQuery = "node_updates_total"
29+
nodeFeatureGroupUpdateRequestsQuery = "node_feature_group_update_requests_total"
30+
nodeUpdateFailuresQuery = "node_update_failures_total"
31+
nodeLabelsRejectedQuery = "node_labels_rejected_total"
32+
nodeERsRejectedQuery = "node_extendedresources_rejected_total"
33+
nodeTaintsRejectedQuery = "node_taints_rejected_total"
34+
nfrProcessingTimeQuery = "nodefeaturerule_processing_duration_seconds"
35+
nfrProcessingErrorsQuery = "nodefeaturerule_processing_errors_total"
36+
)
37+
38+
const (
39+
// nfdMasterPrefix - subsystem name used by nfd master.
40+
nfdMasterPrefix = "nfd_master"
3641
)
3742

3843
var (
3944
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
40-
Name: buildInfoQuery,
41-
Help: "Version from which Node Feature Discovery was built.",
45+
Subsystem: nfdMasterPrefix,
46+
Name: buildInfoQuery,
47+
Help: "Version from which Node Feature Discovery was built.",
4248
ConstLabels: map[string]string{
4349
"version": version.Get(),
4450
},
4551
})
4652
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
47-
Name: nodeUpdateRequestsQuery,
48-
Help: "Number of node update requests processed by the master.",
53+
Subsystem: nfdMasterPrefix,
54+
Name: nodeUpdateRequestsQuery,
55+
Help: "Number of node update requests processed by the master.",
4956
})
5057
nodeFeatureGroupUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
51-
Name: nodeFeatureGroupUpdateRequestsQuery,
52-
Help: "Number of cluster feature update requests processed by the master.",
58+
Subsystem: nfdMasterPrefix,
59+
Name: nodeFeatureGroupUpdateRequestsQuery,
60+
Help: "Number of cluster feature update requests processed by the master.",
5361
})
5462
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
55-
Name: nodeUpdatesQuery,
56-
Help: "Number of nodes updated by the master.",
63+
Subsystem: nfdMasterPrefix,
64+
Name: nodeUpdatesQuery,
65+
Help: "Number of nodes updated by the master.",
5766
})
5867
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
59-
Name: nodeUpdateFailuresQuery,
60-
Help: "Number of node update failures.",
68+
Subsystem: nfdMasterPrefix,
69+
Name: nodeUpdateFailuresQuery,
70+
Help: "Number of node update failures.",
6171
})
6272
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
63-
Name: nodeLabelsRejectedQuery,
64-
Help: "Number of node labels that were rejected by nfd-master.",
73+
Subsystem: nfdMasterPrefix,
74+
Name: nodeLabelsRejectedQuery,
75+
Help: "Number of node labels that were rejected by nfd-master.",
6576
})
6677
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
67-
Name: nodeERsRejectedQuery,
68-
Help: "Number of node extended resources that were rejected by nfd-master.",
78+
Subsystem: nfdMasterPrefix,
79+
Name: nodeERsRejectedQuery,
80+
Help: "Number of node extended resources that were rejected by nfd-master.",
6981
})
7082
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
71-
Name: nodeTaintsRejectedQuery,
72-
Help: "Number of node taints that were rejected by nfd-master.",
83+
Subsystem: nfdMasterPrefix,
84+
Name: nodeTaintsRejectedQuery,
85+
Help: "Number of node taints that were rejected by nfd-master.",
7386
})
7487
nfrProcessingTime = prometheus.NewHistogramVec(
7588
prometheus.HistogramOpts{
76-
Name: nfrProcessingTimeQuery,
77-
Help: "Time processing time of NodeFeatureRule objects.",
78-
Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01},
89+
Subsystem: nfdMasterPrefix,
90+
Name: nfrProcessingTimeQuery,
91+
Help: "Time processing time of NodeFeatureRule objects.",
92+
Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01},
7993
},
8094
[]string{
8195
"name",
8296
"node",
8397
},
8498
)
8599
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
86-
Name: nfrProcessingErrorsQuery,
87-
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
100+
Subsystem: nfdMasterPrefix,
101+
Name: nfrProcessingErrorsQuery,
102+
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
88103
})
89104
)
90105

pkg/nfd-topology-updater/metrics.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,28 @@ import (
2323

2424
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
2525
const (
26-
buildInfoQuery = "nfd_topology_updater_build_info"
27-
scanErrorsQuery = "nfd_topology_updater_scan_errors_total"
26+
buildInfoQuery = "build_info"
27+
scanErrorsQuery = "scan_errors_total"
28+
)
29+
30+
const (
31+
// nfdTopologyUpdaterPrefix - subsystem name used by nfd topology updater.
32+
nfdTopologyUpdaterPrefix = "nfd_topology_updater"
2833
)
2934

3035
var (
3136
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
32-
Name: buildInfoQuery,
33-
Help: "Version from which Node Feature Discovery was built.",
37+
Subsystem: nfdTopologyUpdaterPrefix,
38+
Name: buildInfoQuery,
39+
Help: "Version from which Node Feature Discovery was built.",
3440
ConstLabels: map[string]string{
3541
"version": version.Get(),
3642
},
3743
})
3844
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
39-
Name: scanErrorsQuery,
40-
Help: "Number of errors in scanning resource allocation of pods.",
45+
Subsystem: nfdTopologyUpdaterPrefix,
46+
Name: scanErrorsQuery,
47+
Help: "Number of errors in scanning resource allocation of pods.",
4148
})
4249
)
4350

pkg/nfd-worker/metrics.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,29 @@ import (
2323

2424
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
2525
const (
26-
buildInfoQuery = "nfd_worker_build_info"
27-
featureDiscoveryDurationQuery = "nfd_feature_discovery_duration_seconds"
26+
buildInfoQuery = "build_info"
27+
featureDiscoveryDurationQuery = "feature_discovery_duration_seconds"
28+
)
29+
30+
const (
31+
// nfdWorkerPrefix - subsystem name used by nfd worker.
32+
nfdWorkerPrefix = "nfd_worker"
2833
)
2934

3035
var (
3136
featureDiscoveryDuration = prometheus.NewHistogramVec(
3237
prometheus.HistogramOpts{
33-
Name: featureDiscoveryDurationQuery,
34-
Help: "Time taken to discover features",
35-
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
38+
Subsystem: nfdWorkerPrefix,
39+
Name: featureDiscoveryDurationQuery,
40+
Help: "Time taken to discover features",
41+
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
3642
},
3743
[]string{"node"},
3844
)
3945
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
40-
Name: buildInfoQuery,
41-
Help: "Version from which Node Feature Discovery was built.",
46+
Subsystem: nfdWorkerPrefix,
47+
Name: buildInfoQuery,
48+
Help: "Version from which Node Feature Discovery was built.",
4249
ConstLabels: map[string]string{
4350
"version": version.Get(),
4451
},

0 commit comments

Comments
 (0)