Skip to content

Commit 561c82f

Browse files
authored
Merge pull request #1894 from faganihajizada/feature/dcgm-hpc-job-mapping
2 parents e8061d2 + e06212b commit 561c82f

File tree

9 files changed

+184
-0
lines changed

9 files changed

+184
-0
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ import (
3434

3535
const (
3636
ClusterPolicyCRDName = "ClusterPolicy"
37+
// DefaultDCGMJobMappingDir is the default directory for DCGM Exporter HPC job mapping files.
// It is used when HPC job mapping is enabled and hpcJobMapping.directory is left empty.
38+
DefaultDCGMJobMappingDir = "/var/lib/dcgm-exporter/job-mapping"
3739
)
3840

3941
// ClusterPolicySpec defines the desired state of ClusterPolicy
@@ -928,6 +930,31 @@ type DCGMExporterSpec struct {
928930
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable hostPID for NVIDIA DCGM Exporter"
929931
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
930932
HostPID *bool `json:"hostPID,omitempty"`
933+
934+
// Optional: HPC job mapping configuration for NVIDIA DCGM Exporter
935+
// +kubebuilder:validation:Optional
936+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
937+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration"
938+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
939+
HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"`
940+
}
941+
942+
// DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter.
// The operator acts on it only when Enabled is explicitly set to true (see
// DCGMExporterSpec.IsHPCJobMappingEnabled). In that case TransformDCGMExporter passes the
// directory to the exporter container as DCGM_HPC_JOB_MAPPING_DIR and mounts it read-only
// from the host. An empty Directory falls back to DefaultDCGMJobMappingDir.
943+
type DCGMExporterHPCJobMappingConfig struct {
944+
// Enable HPC job mapping for DCGM Exporter
945+
// +kubebuilder:validation:Optional
946+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
947+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable HPC Job Mapping"
948+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
949+
Enabled *bool `json:"enabled,omitempty"`
950+
951+
// Directory path where HPC job mapping files are created by the workload manager
952+
// Defaults to /var/lib/dcgm-exporter/job-mapping if not specified
953+
// +kubebuilder:validation:Optional
954+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
955+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Job Mapping Directory"
956+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
957+
Directory string `json:"directory,omitempty"`
931958
}
932959

933960
// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
@@ -1943,6 +1970,23 @@ func (e *DCGMExporterSpec) IsHostPIDEnabled() bool {
19431970
return *e.HostPID
19441971
}
19451972

1973+
// IsHPCJobMappingEnabled returns true if HPC job mapping is enabled for DCGM Exporter
1974+
func (e *DCGMExporterSpec) IsHPCJobMappingEnabled() bool {
1975+
if e.HPCJobMapping == nil || e.HPCJobMapping.Enabled == nil {
1976+
// default is false if not specified by user
1977+
return false
1978+
}
1979+
return *e.HPCJobMapping.Enabled
1980+
}
1981+
1982+
// GetHPCJobMappingDirectory returns the directory path for HPC job mapping
1983+
func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string {
1984+
if e.HPCJobMapping == nil {
1985+
return ""
1986+
}
1987+
return e.HPCJobMapping.Directory
1988+
}
1989+
19461990
// IsEnabled returns true if gpu-feature-discovery is enabled(default) through gpu-operator
19471991
func (g *GPUFeatureDiscoverySpec) IsEnabled() bool {
19481992
if g.Enabled == nil {

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,19 @@ spec:
343343
description: HostPID allows the DCGM-Exporter daemon set to access
344344
the host's PID namespace
345345
type: boolean
346+
hpcJobMapping:
347+
description: 'Optional: HPC job mapping configuration for NVIDIA
348+
DCGM Exporter'
349+
properties:
350+
directory:
351+
description: |-
352+
Directory path where HPC job mapping files are created by the workload manager
353+
Defaults to /var/lib/dcgm-exporter/job-mapping if not specified
354+
type: string
355+
enabled:
356+
description: Enable HPC job mapping for DCGM Exporter
357+
type: boolean
358+
type: object
346359
image:
347360
description: NVIDIA DCGM Exporter image name
348361
pattern: '[a-zA-Z0-9\-]+'

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,19 @@ spec:
343343
description: HostPID allows the DCGM-Exporter daemon set to access
344344
the host's PID namespace
345345
type: boolean
346+
hpcJobMapping:
347+
description: 'Optional: HPC job mapping configuration for NVIDIA
348+
DCGM Exporter'
349+
properties:
350+
directory:
351+
description: |-
352+
Directory path where HPC job mapping files are created by the workload manager
353+
Defaults to /var/lib/dcgm-exporter/job-mapping if not specified
354+
type: string
355+
enabled:
356+
description: Enable HPC job mapping for DCGM Exporter
357+
type: boolean
358+
type: object
346359
image:
347360
description: NVIDIA DCGM Exporter image name
348361
pattern: '[a-zA-Z0-9\-]+'

controllers/object_controls.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,6 +1706,31 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
17061706
obj.Spec.Template.Spec.HostPID = true
17071707
}
17081708

1709+
// configure HPC job mapping if enabled
1710+
if config.DCGMExporter.IsHPCJobMappingEnabled() {
1711+
jobMappingDir := config.DCGMExporter.GetHPCJobMappingDirectory()
1712+
if jobMappingDir == "" {
1713+
jobMappingDir = gpuv1.DefaultDCGMJobMappingDir
1714+
}
1715+
1716+
// set environment variable for DCGM Exporter
1717+
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_HPC_JOB_MAPPING_DIR", jobMappingDir)
1718+
1719+
// add volumeMount to main container
1720+
jobMappingVolMount := corev1.VolumeMount{Name: "hpc-job-mapping", ReadOnly: true, MountPath: jobMappingDir}
1721+
obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, jobMappingVolMount)
1722+
1723+
// add volume
1724+
jobMappingVolumeSource := corev1.VolumeSource{
1725+
HostPath: &corev1.HostPathVolumeSource{
1726+
Path: jobMappingDir,
1727+
Type: ptr.To(corev1.HostPathDirectoryOrCreate),
1728+
},
1729+
}
1730+
jobMappingVol := corev1.Volume{Name: "hpc-job-mapping", VolumeSource: jobMappingVolumeSource}
1731+
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, jobMappingVol)
1732+
}
1733+
17091734
// mount configmap for custom metrics if provided by user
17101735
if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
17111736
metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}

controllers/transforms_test.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,6 +1326,49 @@ func TestTransformDCGMExporter(t *testing.T) {
13261326
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").
13271327
WithConfigMapVolume("init-config", "nvidia-dcgm-exporter", int32(0700)),
13281328
},
1329+
{
1330+
description: "transform dcgm exporter with HPC job mapping enabled",
1331+
ds: NewDaemonset().
1332+
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
1333+
WithContainer(corev1.Container{Name: "dummy"}),
1334+
cpSpec: &gpuv1.ClusterPolicySpec{
1335+
DCGMExporter: gpuv1.DCGMExporterSpec{
1336+
Repository: "nvcr.io/nvidia/cloud-native",
1337+
Image: "dcgm-exporter",
1338+
Version: "v1.0.0",
1339+
ImagePullPolicy: "IfNotPresent",
1340+
ImagePullSecrets: []string{"pull-secret"},
1341+
Args: []string{"--fail-on-init-error=false"},
1342+
Env: []gpuv1.EnvVar{
1343+
{Name: "foo", Value: "bar"},
1344+
},
1345+
HPCJobMapping: &gpuv1.DCGMExporterHPCJobMappingConfig{
1346+
Enabled: newBoolPtr(true),
1347+
Directory: "/run/nvidia/dcgm-job-mapping",
1348+
},
1349+
},
1350+
DCGM: gpuv1.DCGMSpec{
1351+
Enabled: newBoolPtr(true),
1352+
},
1353+
},
1354+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
1355+
Name: "dcgm-exporter",
1356+
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
1357+
ImagePullPolicy: corev1.PullIfNotPresent,
1358+
Args: []string{"--fail-on-init-error=false"},
1359+
Env: []corev1.EnvVar{
1360+
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
1361+
{Name: "DCGM_HPC_JOB_MAPPING_DIR", Value: "/run/nvidia/dcgm-job-mapping"},
1362+
{Name: "foo", Value: "bar"},
1363+
},
1364+
VolumeMounts: []corev1.VolumeMount{
1365+
{Name: "hpc-job-mapping", ReadOnly: true, MountPath: "/run/nvidia/dcgm-job-mapping"},
1366+
},
1367+
}).WithContainer(corev1.Container{Name: "dummy"}).
1368+
WithPullSecret("pull-secret").
1369+
WithRuntimeClassName("nvidia").
1370+
WithHostPathVolume("hpc-job-mapping", "/run/nvidia/dcgm-job-mapping", ptr.To(corev1.HostPathDirectoryOrCreate)),
1371+
},
13291372
}
13301373

13311374
for _, tc := range testCases {

deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,19 @@ spec:
343343
description: HostPID allows the DCGM-Exporter daemon set to access
344344
the host's PID namespace
345345
type: boolean
346+
hpcJobMapping:
347+
description: 'Optional: HPC job mapping configuration for NVIDIA
348+
DCGM Exporter'
349+
properties:
350+
directory:
351+
description: |-
352+
Directory path where HPC job mapping files are created by the workload manager
353+
Defaults to /var/lib/dcgm-exporter/job-mapping if not specified
354+
type: string
355+
enabled:
356+
description: Enable HPC job mapping for DCGM Exporter
357+
type: boolean
358+
type: object
346359
image:
347360
description: NVIDIA DCGM Exporter image name
348361
pattern: '[a-zA-Z0-9\-]+'

deployments/gpu-operator/templates/clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,9 @@ spec:
539539
{{- if .Values.dcgmExporter.hostPID }}
540540
hostPID: {{ .Values.dcgmExporter.hostPID }}
541541
{{- end }}
542+
{{- if .Values.dcgmExporter.hpcJobMapping }}
543+
hpcJobMapping: {{ toYaml .Values.dcgmExporter.hpcJobMapping | nindent 6 }}
544+
{{- end }}
542545
gfd:
543546
enabled: {{ .Values.gfd.enabled }}
544547
{{- if .Values.gfd.repository }}

deployments/gpu-operator/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,11 @@ dcgmExporter:
283283
env: []
284284
resources: {}
285285
hostPID: false
286+
# HPC job mapping configuration for correlating GPU metrics with HPC workload manager jobs
287+
# HPC workload managers such as Slurm write job mapping files to the configured directory, and DCGM Exporter uses them to label GPU metrics with job IDs
288+
# hpcJobMapping:
289+
# enabled: true
290+
# directory: /var/lib/dcgm-exporter/job-mapping
286291
service:
287292
internalTrafficPolicy: Cluster
288293
serviceMonitor:

0 commit comments

Comments
 (0)