Skip to content

Commit c927bf5

Browse files
authored
Merge pull request #1407 from marquiz/devel/gc-metrics
nfd-gc: add metrics
2 parents 44b26e3 + 98c3b07 commit c927bf5

File tree

8 files changed

+89
-3
lines changed

8 files changed

+89
-3
lines changed

cmd/nfd-gc/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args {
8383
"interval between cleanup of obsolete api objects")
8484
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
8585
"Kubeconfig to use")
86+
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
87+
"Port on which to expose metrics.")
8688

8789
klog.InitFlags(flagset)
8890

deployment/base/gc/gc.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,6 @@ spec:
2121
imagePullPolicy: Always
2222
command:
2323
- "nfd-gc"
24+
ports:
25+
- name: metrics
26+
containerPort: 8081

deployment/helm/node-feature-discovery/templates/nfd-gc.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ spec:
5858
drop: [ "ALL" ]
5959
readOnlyRootFilesystem: true
6060
runAsNonRoot: true
61+
ports:
62+
- name: metrics
63+
containerPort: {{ .Values.gc.metricsPort | default "8081"}}
6164

6265
{{- with .Values.gc.nodeSelector }}
6366
nodeSelector:

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,8 @@ gc:
495495
# cpu: 100m
496496
# memory: 128Mi
497497

498+
metricsPort: 8081
499+
498500
nodeSelector: {}
499501
tolerations: []
500502
annotations: {}

docs/deployment/helm.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ API's you need to install the prometheus operator in your cluster.
203203
| `gc.interval` | string | 1h | Time between periodic garbage collector runs
204204
| `gc.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings
205205
| `gc.resources` | dict | {} | Garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/)
206+
| `gc.metricsPort` | integer | 8081 | Port on which to serve Prometheus metrics
206207
| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector)
207208
| `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
208209
| `gc.annotations` | dict | {} | Garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)

docs/deployment/metrics.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ The exposed metrics are
1717
| ------------------------------------------------- | --------- | ---------------------------------------
1818
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
1919
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
20+
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built
2021
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built
2122
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC
2223
| `nfd_node_updates_total` | Counter | Number of nodes updated
@@ -28,6 +29,8 @@ The exposed metrics are
2829
| `nfd_nodefeaturerule_processing_errors_total`     | Counter   | Number of errors encountered while processing NodeFeatureRule objects
2930
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
3031
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
32+
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected.
33+
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects.
3134

3235
## Via Kustomize
3336

pkg/nfd-gc/metrics.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
Copyright 2023 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package nfdgarbagecollector
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
"sigs.k8s.io/node-feature-discovery/pkg/version"
22+
)
23+
24+
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
25+
const (
26+
buildInfoQuery = "nfd_gc_build_info"
27+
objectsDeletedQuery = "nfd_gc_objects_deleted_total"
28+
objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total"
29+
)
30+
31+
var (
32+
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
33+
Name: buildInfoQuery,
34+
Help: "Version from which Node Feature Discovery was built.",
35+
ConstLabels: map[string]string{
36+
"version": version.Get(),
37+
},
38+
})
39+
objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{
40+
Name: objectsDeletedQuery,
41+
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
42+
[]string{"kind"},
43+
)
44+
objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
45+
Name: objectDeleteErrorsQuery,
46+
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
47+
[]string{"kind"},
48+
)
49+
)
50+
51+
// registerVersion exposes the Operator build version.
52+
func registerVersion(version string) {
53+
buildInfo.SetToCurrentTime()
54+
}

pkg/nfd-gc/nfd-gc.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,15 @@ import (
3434
"sigs.k8s.io/node-feature-discovery/pkg/apihelper"
3535
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
3636
nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"
37+
"sigs.k8s.io/node-feature-discovery/pkg/utils"
38+
"sigs.k8s.io/node-feature-discovery/pkg/version"
3739
)
3840

3941
// Args are the command line arguments
4042
type Args struct {
41-
GCPeriod time.Duration
42-
43-
Kubeconfig string
43+
GCPeriod time.Duration
44+
Kubeconfig string
45+
MetricsPort int
4446
}
4547

4648
type NfdGarbageCollector interface {
@@ -74,29 +76,35 @@ func New(args *Args) (NfdGarbageCollector, error) {
7476
}
7577

7678
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
79+
kind := "NodeFeature"
7780
if err := n.nfdClient.NfdV1alpha1().NodeFeatures(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
7881
if errors.IsNotFound(err) {
7982
klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", klog.KRef(namespace, name))
8083
return
8184
} else {
8285
klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", klog.KRef(namespace, name))
86+
objectDeleteErrors.WithLabelValues(kind).Inc()
8387
return
8488
}
8589
}
8690
klog.InfoS("NodeFeature object has been deleted", "nodefeature", klog.KRef(namespace, name))
91+
objectsDeleted.WithLabelValues(kind).Inc()
8792
}
8893

8994
func (n *nfdGarbageCollector) deleteNRT(nodeName string) {
95+
kind := "NodeResourceTopology"
9096
if err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil {
9197
if errors.IsNotFound(err) {
9298
klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName)
9399
return
94100
} else {
95101
klog.ErrorS(err, "failed to delete NodeResourceTopology object", "nodeName", nodeName)
102+
objectDeleteErrors.WithLabelValues(kind).Inc()
96103
return
97104
}
98105
}
99106
klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName)
107+
objectsDeleted.WithLabelValues(kind).Inc()
100108
}
101109

102110
func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) {
@@ -208,6 +216,16 @@ func (n *nfdGarbageCollector) startNodeInformer() error {
208216

209217
// Run is a blocking function that removes stale NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed
210218
func (n *nfdGarbageCollector) Run() error {
219+
if n.args.MetricsPort > 0 {
220+
m := utils.CreateMetricsServer(n.args.MetricsPort,
221+
buildInfo,
222+
objectsDeleted,
223+
objectDeleteErrors)
224+
go m.Run()
225+
registerVersion(version.Get())
226+
defer m.Stop()
227+
}
228+
211229
if err := n.startNodeInformer(); err != nil {
212230
return err
213231
}

0 commit comments

Comments
 (0)