Skip to content

Commit a4f514a

Browse files
Merge pull request #2530 from yevgeny-shnaidman/yevgeny/adding-configmap
MGMT-19498: Adding accelerators configuration file to the node-exporter
2 parents a5be61c + b80c418 commit a4f514a

File tree

5 files changed

+108
-5
lines changed

5 files changed

+108
-5
lines changed

assets/node-exporter/accelerators-collector-configmap.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
apiVersion: v1
2+
data:
3+
config.yaml: |-
4+
- "models":
5+
- "pciID": "0x20b5"
6+
"pciName": "A100"
7+
- "pciID": "0x2230"
8+
"pciName": "RTX_A6000"
9+
- "pciID": "0x2717"
10+
"pciName": "RTX_4090"
11+
- "pciID": "0x2235"
12+
"pciName": "A40"
13+
- "pciID": "0x1df5"
14+
"pciName": "V100"
15+
- "pciID": "0x20f1"
16+
"pciName": "A100 40G"
17+
- "pciID": "0x1ff2"
18+
"pciName": "T400 4GB"
19+
- "pciID": "0x1eb8"
20+
"pciName": "Tesla T4"
21+
"vendorID": "0x10de"
22+
"vendorName": "NVIDIA"
23+
kind: ConfigMap
24+
metadata:
25+
labels:
26+
app.kubernetes.io/managed-by: cluster-monitoring-operator
27+
app.kubernetes.io/part-of: openshift-monitoring
28+
name: node-exporter-accelerators-collector-config
29+
namespace: openshift-monitoring

assets/node-exporter/daemonset.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ spec:
7979
- mountPath: /var/node_exporter/textfile
8080
name: node-exporter-textfile
8181
readOnly: true
82+
- mountPath: /var/node_exporter/accelerators_collector_config
83+
name: node-exporter-accelerators-collector-config
84+
readOnly: true
8285
workingDir: /var/node_exporter/textfile
8386
- args:
8487
- --secure-listen-address=[$(IP)]:9100
@@ -182,6 +185,12 @@ spec:
182185
- name: node-exporter-kube-rbac-proxy-config
183186
secret:
184187
secretName: node-exporter-kube-rbac-proxy-config
188+
- configMap:
189+
items:
190+
- key: config.yaml
191+
path: config.yaml
192+
name: node-exporter-accelerators-collector-config
193+
name: node-exporter-accelerators-collector-config
185194
updateStrategy:
186195
rollingUpdate:
187196
maxUnavailable: 10%

jsonnet/components/node-exporter.libsonnet

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,26 @@ local textfileVolumeName = 'node-exporter-textfile';
33
local tlsVolumeName = 'node-exporter-tls';
44
local wtmpPath = '/var/log/wtmp';
55
local wtmpVolumeName = 'node-exporter-wtmp';
6+
local configDir = '/var/node_exporter/accelerators_collector_config';
7+
local configVolumeName = 'node-exporter-accelerators-collector-config';
8+
local acceleratorsConfigFileName = 'config.yaml';
9+
local acceleratorsConfigMapName = 'node-exporter-accelerators-collector-config';
10+
local acceleratorsConfigData = [
11+
{
12+
vendorName: 'NVIDIA',
13+
vendorID: '0x10de',
14+
models: [
15+
{ pciID: '0x20b5', pciName: 'A100' },
16+
{ pciID: '0x2230', pciName: 'RTX_A6000' },
17+
{ pciID: '0x2717', pciName: 'RTX_4090' },
18+
{ pciID: '0x2235', pciName: 'A40' },
19+
{ pciID: '0x1df5', pciName: 'V100' },
20+
{ pciID: '0x20f1', pciName: 'A100 40G' },
21+
{ pciID: '0x1ff2', pciName: 'T400 4GB' },
22+
{ pciID: '0x1eb8', pciName: 'Tesla T4' },
23+
],
24+
},
25+
];
626

727
local nodeExporter = import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/components/node-exporter.libsonnet';
828
local generateSecret = import '../utils/generate-secret.libsonnet';
@@ -269,11 +289,18 @@ function(params)
269289
exec /bin/node_exporter "$0" "$@"
270290
|||,
271291
],
272-
volumeMounts+: [{
273-
mountPath: textfileDir,
274-
name: textfileVolumeName,
275-
readOnly: true,
276-
}],
292+
volumeMounts+: [
293+
{
294+
mountPath: textfileDir,
295+
name: textfileVolumeName,
296+
readOnly: true,
297+
},
298+
{
299+
mountPath: configDir,
300+
name: configVolumeName,
301+
readOnly: true,
302+
},
303+
],
277304
workingDir: textfileDir,
278305
resources+: {
279306
requests+: {
@@ -324,6 +351,18 @@ function(params)
324351
secretName: 'node-exporter-kube-rbac-proxy-config',
325352
},
326353
},
354+
{
355+
name: configVolumeName,
356+
configMap: {
357+
name: acceleratorsConfigMapName,
358+
items: [
359+
{
360+
key: acceleratorsConfigFileName,
361+
path: acceleratorsConfigFileName,
362+
},
363+
],
364+
},
365+
},
327366
],
328367
securityContext: {},
329368
priorityClassName: 'system-cluster-critical',
@@ -350,4 +389,16 @@ function(params)
350389
],
351390
},
352391
},
392+
393+
acceleratorsCollectorConfigmap: {
394+
apiVersion: 'v1',
395+
kind: 'ConfigMap',
396+
metadata: {
397+
name: acceleratorsConfigMapName,
398+
namespace: cfg.namespace,
399+
},
400+
data: {
401+
[acceleratorsConfigFileName]: std.manifestYamlDoc(acceleratorsConfigData),
402+
},
403+
},
353404
}

pkg/manifests/manifests.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ var (
138138
NodeExporterMinimalServiceMonitor = "node-exporter/minimal-service-monitor.yaml"
139139
NodeExporterPrometheusRule = "node-exporter/prometheus-rule.yaml"
140140
NodeExporterKubeRbacProxySecret = "node-exporter/kube-rbac-proxy-secret.yaml"
141+
NodeExporterAcceleratorsConfigMap = "node-exporter/accelerators-collector-configmap.yaml"
141142

142143
PrometheusK8sClusterRoleBinding = "prometheus-k8s/cluster-role-binding.yaml"
143144
PrometheusK8sRoleBindingConfig = "prometheus-k8s/role-binding-config.yaml"
@@ -1033,6 +1034,10 @@ func (f *Factory) NodeExporterRBACProxySecret() (*v1.Secret, error) {
10331034
return f.NewSecret(f.assets.MustNewAssetSlice(NodeExporterKubeRbacProxySecret))
10341035
}
10351036

1037+
func (f *Factory) NodeExporterAcceleratorsCollectorConfigMap() (*v1.ConfigMap, error) {
1038+
return f.NewConfigMap(f.assets.MustNewAssetSlice(NodeExporterAcceleratorsConfigMap))
1039+
}
1040+
10361041
func (f *Factory) PrometheusK8sClusterRoleBinding() (*rbacv1.ClusterRoleBinding, error) {
10371042
return f.NewClusterRoleBinding(f.assets.MustNewAssetSlice(PrometheusK8sClusterRoleBinding))
10381043
}

pkg/tasks/nodeexporter.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,15 @@ func (t *NodeExporterTask) Run(ctx context.Context) error {
9494
return fmt.Errorf("reconciling node-exporter Service failed: %w", err)
9595
}
9696

97+
cm, err := t.factory.NodeExporterAcceleratorsCollectorConfigMap()
98+
if err != nil {
99+
return fmt.Errorf("initializing node-exporter accelerators collector ConfigMap failed: %w", err)
100+
}
101+
err = t.client.CreateOrUpdateConfigMap(ctx, cm)
102+
if err != nil {
103+
return fmt.Errorf("reconciling node-exporter accelerators collector ConfigMap failed: %w", err)
104+
}
105+
97106
ds, err := t.factory.NodeExporterDaemonSet()
98107
if err != nil {
99108
return fmt.Errorf("initializing node-exporter DaemonSet failed: %w", err)

0 commit comments

Comments
 (0)