Skip to content

Commit a36470a

Browse files
authored
Merge pull request #1962 from nikhaild/pull-request/dcgm-exporter-hostnetwork
[dcgm-exporter] Support exposing metrics on hostNetwork
2 parents f355b55 + 95255ef commit a36470a

File tree

9 files changed

+176
-0
lines changed

9 files changed

+176
-0
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,13 @@ type DCGMExporterSpec struct {
931931
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
932932
HostPID *bool `json:"hostPID,omitempty"`
933933

934+
// HostNetwork allows the DCGM-Exporter daemon set to expose metrics port on the host's network namespace.
935+
// +kubebuilder:validation:Optional
936+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
937+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable hostNetwork for NVIDIA DCGM Exporter"
938+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
939+
HostNetwork *bool `json:"hostNetwork,omitempty"`
940+
934941
// Optional: HPC job mapping configuration for NVIDIA DCGM Exporter
935942
// +kubebuilder:validation:Optional
936943
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
@@ -1970,6 +1977,15 @@ func (e *DCGMExporterSpec) IsHostPIDEnabled() bool {
19701977
return *e.HostPID
19711978
}
19721979

1980+
// IsHostNetworkEnabled reports whether hostNetwork is enabled for DCGM Exporter; it returns false when the HostNetwork field is unset.
1981+
func (e *DCGMExporterSpec) IsHostNetworkEnabled() bool {
1982+
if e.HostNetwork == nil {
1983+
// HostNetwork was not specified by the user, so fall back to the default (disabled)
1984+
return false
1985+
}
1986+
return *e.HostNetwork
1987+
}
1988+
19731989
// IsHPCJobMappingEnabled returns true if HPC job mapping is enabled for DCGM Exporter
19741990
func (e *DCGMExporterSpec) IsHPCJobMappingEnabled() bool {
19751991
if e.HPCJobMapping == nil || e.HPCJobMapping.Enabled == nil {

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ spec:
339339
- name
340340
type: object
341341
type: array
342+
hostNetwork:
343+
description: HostNetwork allows the DCGM-Exporter daemon set to
344+
expose metrics port on the host's network namespace.
345+
type: boolean
342346
hostPID:
343347
description: HostPID allows the DCGM-Exporter daemon set to access
344348
the host's PID namespace

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ spec:
339339
- name
340340
type: object
341341
type: array
342+
hostNetwork:
343+
description: HostNetwork allows the DCGM-Exporter daemon set to
344+
expose metrics port on the host's network namespace.
345+
type: boolean
342346
hostPID:
343347
description: HostPID allows the DCGM-Exporter daemon set to access
344348
the host's PID namespace

controllers/object_controls.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1696,8 +1696,15 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
16961696
if remoteEngine != "" && strings.HasPrefix(remoteEngine, "localhost") {
16971697
// enable hostNetwork for communication with external DCGM using localhost
16981698
obj.Spec.Template.Spec.HostNetwork = true
1699+
obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet
16991700
}
17001701
}
1702+
// enable hostNetwork (and the matching DNS policy) when requested for DCGM Exporter;
1703+
// when it is not requested, any value already enabled above is left untouched
1704+
if config.DCGMExporter.IsHostNetworkEnabled() {
1705+
obj.Spec.Template.Spec.HostNetwork = true
1706+
obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet
1707+
}
17011708

17021709
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
17031710

controllers/transforms_test.go

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ func (d Daemonset) WithHostNetwork(enabled bool) Daemonset {
168168
return d
169169
}
170170

171+
func (d Daemonset) WithDNSPolicy(policy corev1.DNSPolicy) Daemonset {
172+
d.Spec.Template.Spec.DNSPolicy = policy
173+
return d
174+
}
175+
171176
func (d Daemonset) WithHostPID(enabled bool) Daemonset {
172177
d.Spec.Template.Spec.HostPID = enabled
173178
return d
@@ -1272,6 +1277,133 @@ func TestTransformDCGMExporter(t *testing.T) {
12721277
},
12731278
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostPID(false),
12741279
},
1280+
{
1281+
description: "transform dcgm exporter with hostNetwork enabled",
1282+
ds: NewDaemonset().
1283+
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
1284+
WithContainer(corev1.Container{Name: "dummy"}),
1285+
cpSpec: &gpuv1.ClusterPolicySpec{
1286+
DCGMExporter: gpuv1.DCGMExporterSpec{
1287+
Repository: "nvcr.io/nvidia/cloud-native",
1288+
Image: "dcgm-exporter",
1289+
Version: "v1.0.0",
1290+
ImagePullPolicy: "IfNotPresent",
1291+
ImagePullSecrets: []string{"pull-secret"},
1292+
Args: []string{"--fail-on-init-error=false"},
1293+
HostNetwork: newBoolPtr(true),
1294+
Env: []gpuv1.EnvVar{
1295+
{Name: "foo", Value: "bar"},
1296+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
1297+
},
1298+
},
1299+
},
1300+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1301+
Name: "dcgm-exporter",
1302+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1303+
ImagePullPolicy: corev1.PullIfNotPresent,
1304+
Args: []string{"--fail-on-init-error=false"},
1305+
Env: []corev1.EnvVar{
1306+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
1307+
{Name: "foo", Value: "bar"},
1308+
},
1309+
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(true).WithDNSPolicy(corev1.DNSClusterFirstWithHostNet),
1310+
},
1311+
{
1312+
description: "transform dcgm exporter with hostNetwork disabled",
1313+
ds: NewDaemonset().
1314+
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
1315+
WithContainer(corev1.Container{Name: "dummy"}),
1316+
cpSpec: &gpuv1.ClusterPolicySpec{
1317+
DCGMExporter: gpuv1.DCGMExporterSpec{
1318+
Repository: "nvcr.io/nvidia/cloud-native",
1319+
Image: "dcgm-exporter",
1320+
Version: "v1.0.0",
1321+
ImagePullPolicy: "IfNotPresent",
1322+
ImagePullSecrets: []string{"pull-secret"},
1323+
Args: []string{"--fail-on-init-error=false"},
1324+
HostNetwork: newBoolPtr(false),
1325+
Env: []gpuv1.EnvVar{
1326+
{Name: "foo", Value: "bar"},
1327+
},
1328+
},
1329+
},
1330+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1331+
Name: "dcgm-exporter",
1332+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1333+
ImagePullPolicy: corev1.PullIfNotPresent,
1334+
Args: []string{"--fail-on-init-error=false"},
1335+
Env: []corev1.EnvVar{
1336+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
1337+
{Name: "foo", Value: "bar"},
1338+
},
1339+
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false),
1340+
},
1341+
{
1342+
description: "transform dcgm exporter with hostNetwork unspecified",
1343+
ds: NewDaemonset().
1344+
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
1345+
WithContainer(corev1.Container{Name: "dummy"}),
1346+
cpSpec: &gpuv1.ClusterPolicySpec{
1347+
DCGMExporter: gpuv1.DCGMExporterSpec{
1348+
Repository: "nvcr.io/nvidia/cloud-native",
1349+
Image: "dcgm-exporter",
1350+
Version: "v1.0.0",
1351+
ImagePullPolicy: "IfNotPresent",
1352+
ImagePullSecrets: []string{"pull-secret"},
1353+
Args: []string{"--fail-on-init-error=false"},
1354+
Env: []gpuv1.EnvVar{
1355+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
1356+
{Name: "foo", Value: "bar"},
1357+
},
1358+
},
1359+
},
1360+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1361+
Name: "dcgm-exporter",
1362+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1363+
ImagePullPolicy: corev1.PullIfNotPresent,
1364+
Args: []string{"--fail-on-init-error=false"},
1365+
Env: []corev1.EnvVar{
1366+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
1367+
{Name: "foo", Value: "bar"},
1368+
},
1369+
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false),
1370+
},
1371+
{
1372+
description: "transform dcgm exporter with dcgm running on the host itself(DGX BaseOS)",
1373+
ds: NewDaemonset().
1374+
WithContainer(corev1.Container{
1375+
Name: "dcgm-exporter",
1376+
Env: []corev1.EnvVar{{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"}},
1377+
}).
1378+
WithContainer(corev1.Container{Name: "dummy"}),
1379+
cpSpec: &gpuv1.ClusterPolicySpec{
1380+
DCGM: gpuv1.DCGMSpec{
1381+
Enabled: newBoolPtr(false),
1382+
},
1383+
DCGMExporter: gpuv1.DCGMExporterSpec{
1384+
Repository: "nvcr.io/nvidia/cloud-native",
1385+
Image: "dcgm-exporter",
1386+
Version: "v1.0.0",
1387+
ImagePullPolicy: "IfNotPresent",
1388+
ImagePullSecrets: []string{"pull-secret"},
1389+
Args: []string{"--fail-on-init-error=false"},
1390+
Env: []gpuv1.EnvVar{
1391+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
1392+
{Name: "foo", Value: "bar"},
1393+
},
1394+
},
1395+
},
1396+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1397+
Name: "dcgm-exporter",
1398+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1399+
ImagePullPolicy: corev1.PullIfNotPresent,
1400+
Args: []string{"--fail-on-init-error=false"},
1401+
Env: []corev1.EnvVar{
1402+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
1403+
{Name: "foo", Value: "bar"},
1404+
},
1405+
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(true).WithDNSPolicy(corev1.DNSClusterFirstWithHostNet),
1406+
},
12751407
{
12761408
description: "transform dcgm exporter, openshift",
12771409
openshiftVersion: "1.0.0",

deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ spec:
339339
- name
340340
type: object
341341
type: array
342+
hostNetwork:
343+
description: HostNetwork allows the DCGM-Exporter daemon set to
344+
expose metrics port on the host's network namespace.
345+
type: boolean
342346
hostPID:
343347
description: HostPID allows the DCGM-Exporter daemon set to access
344348
the host's PID namespace

deployments/gpu-operator/templates/clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,9 @@ spec:
539539
{{- if .Values.dcgmExporter.hostPID }}
540540
hostPID: {{ .Values.dcgmExporter.hostPID }}
541541
{{- end }}
542+
{{- if .Values.dcgmExporter.hostNetwork }}
543+
hostNetwork: {{ .Values.dcgmExporter.hostNetwork }}
544+
{{- end }}
542545
{{- if .Values.dcgmExporter.hpcJobMapping }}
543546
hpcJobMapping: {{ toYaml .Values.dcgmExporter.hpcJobMapping | nindent 6 }}
544547
{{- end }}

deployments/gpu-operator/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ dcgmExporter:
283283
env: []
284284
resources: {}
285285
hostPID: false
286+
hostNetwork: false
286287
# HPC job mapping configuration for correlating GPU metrics with HPC workload manager jobs
287288
# This is used by HPC workload managers like Slurm to label GPU metrics with job IDs
288289
# hpcJobMapping:

0 commit comments

Comments
 (0)