Skip to content

Commit d858d24

Browse files
committed
[dcgm-exporter] Support exposing metrics on hostNetwork
Add support for toggling the `hostNetwork` field of the dcgm-exporter daemonset pod spec, so that dcgm-exporter pods can be scraped by, for example, a prometheus-server that runs outside the k8s cluster overlay network but still needs to reach the dcgm-exporter pods. Fixes #1086 Signed-off-by: Nikhil R Deshpande <[email protected]>
1 parent e9d7368 commit d858d24

File tree

9 files changed

+133
-0
lines changed

9 files changed

+133
-0
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,13 @@ type DCGMExporterSpec struct {
931931
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
932932
HostPID *bool `json:"hostPID,omitempty"`
933933

934+
// HostNetwork allows the DCGM-Exporter daemon set to expose metrics port on the host's network namespace.
935+
// +kubebuilder:validation:Optional
936+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
937+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable hostNetwork for NVIDIA DCGM Exporter"
938+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
939+
HostNetwork *bool `json:"hostNetwork,omitempty"`
940+
934941
// Optional: HPC job mapping configuration for NVIDIA DCGM Exporter
935942
// +kubebuilder:validation:Optional
936943
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
@@ -1970,6 +1977,15 @@ func (e *DCGMExporterSpec) IsHostPIDEnabled() bool {
19701977
return *e.HostPID
19711978
}
19721979

1980+
// IsHostNetworkEnabled returns true if hostNetwork is enabled for DCGM Exporter
1981+
func (e *DCGMExporterSpec) IsHostNetworkEnabled() bool {
1982+
if e.HostNetwork == nil {
1983+
// default is false if not specified by user
1984+
return false
1985+
}
1986+
return *e.HostNetwork
1987+
}
1988+
19731989
// IsHPCJobMappingEnabled returns true if HPC job mapping is enabled for DCGM Exporter
19741990
func (e *DCGMExporterSpec) IsHPCJobMappingEnabled() bool {
19751991
if e.HPCJobMapping == nil || e.HPCJobMapping.Enabled == nil {

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ spec:
339339
- name
340340
type: object
341341
type: array
342+
hostNetwork:
343+
description: HostNetwork allows the DCGM-Exporter daemon set to
344+
expose metrics port on the host's network namespace.
345+
type: boolean
342346
hostPID:
343347
description: HostPID allows the DCGM-Exporter daemon set to access
344348
the host's PID namespace

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ spec:
339339
- name
340340
type: object
341341
type: array
342+
hostNetwork:
343+
description: HostNetwork allows the DCGM-Exporter daemon set to
344+
expose metrics port on the host's network namespace.
345+
type: boolean
342346
hostPID:
343347
description: HostPID allows the DCGM-Exporter daemon set to access
344348
the host's PID namespace

controllers/object_controls.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,6 +1698,11 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
16981698
obj.Spec.Template.Spec.HostNetwork = true
16991699
}
17001700
}
1701+
// enable hostNetwork when it is requested for DCGM Exporter; when it is not requested,
1702+
// leave whatever value was set above untouched
1703+
if config.DCGMExporter.IsHostNetworkEnabled() {
1704+
obj.Spec.Template.Spec.HostNetwork = true
1705+
}
17011706

17021707
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
17031708

controllers/transforms_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,97 @@ func TestTransformDCGMExporter(t *testing.T) {
12721272
},
12731273
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostPID(false),
12741274
},
1275+
{
1276+
description: "transform dcgm exporter with hostNetwork enabled",
1277+
ds: NewDaemonset().
1278+
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
1279+
WithContainer(corev1.Container{Name: "dummy"}),
1280+
cpSpec: &gpuv1.ClusterPolicySpec{
1281+
DCGMExporter: gpuv1.DCGMExporterSpec{
1282+
Repository: "nvcr.io/nvidia/cloud-native",
1283+
Image: "dcgm-exporter",
1284+
Version: "v1.0.0",
1285+
ImagePullPolicy: "IfNotPresent",
1286+
ImagePullSecrets: []string{"pull-secret"},
1287+
Args: []string{"--fail-on-init-error=false"},
1288+
HostNetwork: newBoolPtr(true),
1289+
Env: []gpuv1.EnvVar{
1290+
{Name: "foo", Value: "bar"},
1291+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
1292+
},
1293+
},
1294+
},
1295+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1296+
Name: "dcgm-exporter",
1297+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1298+
ImagePullPolicy: corev1.PullIfNotPresent,
1299+
Args: []string{"--fail-on-init-error=false"},
1300+
Env: []corev1.EnvVar{
1301+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
1302+
{Name: "foo", Value: "bar"},
1303+
},
1304+
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(true),
1305+
},
1306+
{
1307+
description: "transform dcgm exporter with hostNetwork disabled",
1308+
ds: NewDaemonset().
1309+
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
1310+
WithContainer(corev1.Container{Name: "dummy"}),
1311+
cpSpec: &gpuv1.ClusterPolicySpec{
1312+
DCGMExporter: gpuv1.DCGMExporterSpec{
1313+
Repository: "nvcr.io/nvidia/cloud-native",
1314+
Image: "dcgm-exporter",
1315+
Version: "v1.0.0",
1316+
ImagePullPolicy: "IfNotPresent",
1317+
ImagePullSecrets: []string{"pull-secret"},
1318+
Args: []string{"--fail-on-init-error=false"},
1319+
HostNetwork: newBoolPtr(false),
1320+
Env: []gpuv1.EnvVar{
1321+
{Name: "foo", Value: "bar"},
1322+
},
1323+
},
1324+
},
1325+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1326+
Name: "dcgm-exporter",
1327+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1328+
ImagePullPolicy: corev1.PullIfNotPresent,
1329+
Args: []string{"--fail-on-init-error=false"},
1330+
Env: []corev1.EnvVar{
1331+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
1332+
{Name: "foo", Value: "bar"},
1333+
},
1334+
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false),
1335+
},
1336+
{
1337+
description: "transform dcgm exporter with hostNetwork unspecified",
1338+
ds: NewDaemonset().
1339+
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
1340+
WithContainer(corev1.Container{Name: "dummy"}),
1341+
cpSpec: &gpuv1.ClusterPolicySpec{
1342+
DCGMExporter: gpuv1.DCGMExporterSpec{
1343+
Repository: "nvcr.io/nvidia/cloud-native",
1344+
Image: "dcgm-exporter",
1345+
Version: "v1.0.0",
1346+
ImagePullPolicy: "IfNotPresent",
1347+
ImagePullSecrets: []string{"pull-secret"},
1348+
Args: []string{"--fail-on-init-error=false"},
1349+
Env: []gpuv1.EnvVar{
1350+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
1351+
{Name: "foo", Value: "bar"},
1352+
},
1353+
},
1354+
},
1355+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1356+
Name: "dcgm-exporter",
1357+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1358+
ImagePullPolicy: corev1.PullIfNotPresent,
1359+
Args: []string{"--fail-on-init-error=false"},
1360+
Env: []corev1.EnvVar{
1361+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
1362+
{Name: "foo", Value: "bar"},
1363+
},
1364+
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false),
1365+
},
12751366
{
12761367
description: "transform dcgm exporter, openshift",
12771368
openshiftVersion: "1.0.0",

deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ spec:
339339
- name
340340
type: object
341341
type: array
342+
hostNetwork:
343+
description: HostNetwork allows the DCGM-Exporter daemon set to
344+
expose metrics port on the host's network namespace.
345+
type: boolean
342346
hostPID:
343347
description: HostPID allows the DCGM-Exporter daemon set to access
344348
the host's PID namespace

deployments/gpu-operator/templates/clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,9 @@ spec:
539539
{{- if .Values.dcgmExporter.hostPID }}
540540
hostPID: {{ .Values.dcgmExporter.hostPID }}
541541
{{- end }}
542+
{{- if .Values.dcgmExporter.hostNetwork }}
543+
hostNetwork: {{ .Values.dcgmExporter.hostNetwork }}
544+
{{- end }}
542545
{{- if .Values.dcgmExporter.hpcJobMapping }}
543546
hpcJobMapping: {{ toYaml .Values.dcgmExporter.hpcJobMapping | nindent 6 }}
544547
{{- end }}

deployments/gpu-operator/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ dcgmExporter:
283283
env: []
284284
resources: {}
285285
hostPID: false
286+
hostNetwork: false
286287
# HPC job mapping configuration for correlating GPU metrics with HPC workload manager jobs
287288
# This is used by HPC workload managers like Slurm to label GPU metrics with job IDs
288289
# hpcJobMapping:

0 commit comments

Comments
 (0)