Skip to content

Commit 1ceaafa

Browse files
Shiva Kumar (shivakunv)
authored and committed
vgpu-manager: enable kernel module configuration via KernelModuleConfig
Signed-off-by: Shiva Kumar (SW-CLOUD) <[email protected]>
1 parent 8ca5c55 commit 1ceaafa

File tree

10 files changed

+107
-22
lines changed

10 files changed

+107
-22
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,11 @@ type VGPUManagerSpec struct {
651651

652652
// DriverManager represents configuration for NVIDIA Driver Manager initContainer
653653
DriverManager DriverManagerSpec `json:"driverManager,omitempty"`
654+
655+
// Optional: Kernel module configuration parameters for the vGPU manager
656+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
657+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Kernel module configuration parameters for the vGPU manager"
658+
KernelModuleConfig *KernelModuleConfigSpec `json:"kernelModuleConfig,omitempty"`
654659
}
655660

656661
// ToolkitSpec defines the properties for NVIDIA Container Toolkit deployment

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2310,6 +2310,13 @@ spec:
23102310
items:
23112311
type: string
23122312
type: array
2313+
kernelModuleConfig:
2314+
description: 'Optional: Kernel module configuration parameters
2315+
for the vGPU manager'
2316+
properties:
2317+
name:
2318+
type: string
2319+
type: object
23132320
repository:
23142321
description: NVIDIA vGPU Manager image repository
23152322
type: string

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2310,6 +2310,13 @@ spec:
23102310
items:
23112311
type: string
23122312
type: array
2313+
kernelModuleConfig:
2314+
description: 'Optional: Kernel module configuration parameters
2315+
for the vGPU manager'
2316+
properties:
2317+
name:
2318+
type: string
2319+
type: object
23132320
repository:
23142321
description: NVIDIA vGPU Manager image repository
23152322
type: string

config/samples/v1_clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ spec:
101101

102102
vgpuManager:
103103
enabled: true
104+
# kernel module configuration for vGPU manager
105+
kernelModuleConfig:
106+
name: ""
104107

105108
vgpuDeviceManager:
106109
enabled: true

controllers/object_controls.go

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,10 @@ const (
175175
DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
176176
// NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime
177177
NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"
178+
179+
// driversDir is the name of the directory used by the driver-container to represent the path
180+
// of the drivers directory mounted in the container
181+
driversDir = "/drivers"
178182
)
179183

180184
// ContainerProbe defines container probe types
@@ -2811,13 +2815,21 @@ func transformPeerMemoryContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPo
28112815
if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
28122816
// note: transformDriverContainer() will have already created a Volume backed by the ConfigMap.
28132817
// Only add a VolumeMount for nvidia-peermem-ctr.
2814-
destinationDir := "/drivers"
2815-
volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
2818+
volumeMounts, _, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, driversDir)
28162819
if err != nil {
28172820
return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
28182821
}
28192822
obj.Spec.Template.Spec.Containers[i].VolumeMounts = append(obj.Spec.Template.Spec.Containers[i].VolumeMounts, volumeMounts...)
28202823
}
2824+
if config.VGPUManager.KernelModuleConfig != nil && config.VGPUManager.KernelModuleConfig.Name != "" {
2825+
// note: transformVGPUManagerContainer() will have already created a Volume backed by the ConfigMap.
2826+
// Only add a VolumeMount for nvidia-vgpu-manager-ctr.
2827+
volumeMounts, _, err := createConfigMapVolumeMounts(n, config.VGPUManager.KernelModuleConfig.Name, driversDir)
2828+
if err != nil {
2829+
return fmt.Errorf("failed to create ConfigMap VolumeMounts for vGPU manager kernel module configuration: %w", err)
2830+
}
2831+
obj.Spec.Template.Spec.Containers[i].VolumeMounts = append(obj.Spec.Template.Spec.Containers[i].VolumeMounts, volumeMounts...)
2832+
}
28212833
if config.Driver.Resources != nil {
28222834
obj.Spec.Template.Spec.Containers[i].Resources = corev1.ResourceRequirements{
28232835
Requests: config.Driver.Resources.Requests,
@@ -3481,8 +3493,7 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
34813493

34823494
// mount any custom kernel module configuration parameters at /drivers
34833495
if config.Driver.KernelModuleConfig != nil && config.Driver.KernelModuleConfig.Name != "" {
3484-
destinationDir := "/drivers"
3485-
volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, destinationDir)
3496+
volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.Driver.KernelModuleConfig.Name, driversDir)
34863497
if err != nil {
34873498
return fmt.Errorf("ERROR: failed to create ConfigMap VolumeMounts for kernel module configuration: %v", err)
34883499
}
@@ -3613,6 +3624,7 @@ func createSecretEnvReference(ctx context.Context, ctrlClient client.Client, sec
36133624
}
36143625

36153626
func transformVGPUManagerContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
3627+
podSpec := &obj.Spec.Template.Spec
36163628
container := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-vgpu-manager-ctr")
36173629

36183630
if container == nil {
@@ -3660,6 +3672,16 @@ func transformVGPUManagerContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterP
36603672
}
36613673
}
36623674

3675+
// mount any custom kernel module configuration parameters at /drivers
3676+
if config.VGPUManager.KernelModuleConfig != nil && config.VGPUManager.KernelModuleConfig.Name != "" {
3677+
volumeMounts, itemsToInclude, err := createConfigMapVolumeMounts(n, config.VGPUManager.KernelModuleConfig.Name, driversDir)
3678+
if err != nil {
3679+
return fmt.Errorf("failed to create ConfigMap VolumeMounts for kernel module configuration: %w", err)
3680+
}
3681+
container.VolumeMounts = append(container.VolumeMounts, volumeMounts...)
3682+
podSpec.Volumes = append(podSpec.Volumes, createConfigMapVolume(config.VGPUManager.KernelModuleConfig.Name, itemsToInclude))
3683+
}
3684+
36633685
return nil
36643686
}
36653687

controllers/object_controls_test.go

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -104,15 +104,16 @@ var kubernetesResources = []client.Object{
104104
}
105105

106106
type commonDaemonsetSpec struct {
107-
repository string
108-
image string
109-
version string
110-
imagePullPolicy string
111-
imagePullSecrets []corev1.LocalObjectReference
112-
args []string
113-
env []gpuv1.EnvVar
114-
resources *gpuv1.ResourceRequirements
115-
startupProbe *gpuv1.ContainerProbeSpec
107+
repository string
108+
image string
109+
version string
110+
imagePullPolicy string
111+
imagePullSecrets []corev1.LocalObjectReference
112+
args []string
113+
env []gpuv1.EnvVar
114+
resources *gpuv1.ResourceRequirements
115+
startupProbe *gpuv1.ContainerProbeSpec
116+
kernelModuleConfig *gpuv1.KernelModuleConfigSpec
116117
}
117118

118119
func TestMain(m *testing.M) {
@@ -371,14 +372,15 @@ func testDaemonsetCommon(t *testing.T, cp *gpuv1.ClusterPolicy, component string
371372
}
372373
case "VGPUManager":
373374
spec = commonDaemonsetSpec{
374-
repository: cp.Spec.VGPUManager.Repository,
375-
image: cp.Spec.VGPUManager.Image,
376-
version: cp.Spec.VGPUManager.Version,
377-
imagePullPolicy: cp.Spec.VGPUManager.ImagePullPolicy,
378-
imagePullSecrets: getImagePullSecrets(cp.Spec.VGPUManager.ImagePullSecrets),
379-
args: cp.Spec.VGPUManager.Args,
380-
env: cp.Spec.VGPUManager.Env,
381-
resources: cp.Spec.VGPUManager.Resources,
375+
repository: cp.Spec.VGPUManager.Repository,
376+
image: cp.Spec.VGPUManager.Image,
377+
version: cp.Spec.VGPUManager.Version,
378+
imagePullPolicy: cp.Spec.VGPUManager.ImagePullPolicy,
379+
imagePullSecrets: getImagePullSecrets(cp.Spec.VGPUManager.ImagePullSecrets),
380+
args: cp.Spec.VGPUManager.Args,
381+
env: cp.Spec.VGPUManager.Env,
382+
resources: cp.Spec.VGPUManager.Resources,
383+
kernelModuleConfig: cp.Spec.VGPUManager.KernelModuleConfig,
382384
}
383385
dsLabel = "nvidia-vgpu-manager-daemonset"
384386
mainCtrName = "nvidia-vgpu-manager-ctr"
@@ -765,7 +767,7 @@ func getVGPUManagerTestInput(testCase string) *gpuv1.ClusterPolicy {
765767
cp.Spec.VGPUManager.ImagePullSecrets = []string{"ngc-secret"}
766768
cp.Spec.VGPUManager.DriverManager.ImagePullSecrets = []string{"ngc-secret"}
767769
clusterPolicyController.sandboxEnabled = true
768-
770+
cp.Spec.VGPUManager.KernelModuleConfig = &gpuv1.KernelModuleConfigSpec{Name: "vgpu-manager-kernel-module-config"}
769771
switch testCase {
770772
case "default":
771773
// Do nothing
@@ -785,6 +787,7 @@ func getVGPUManagerTestOutput(testCase string) map[string]interface{} {
785787
"driverImage": "nvcr.io/nvidia/vgpu-manager:470.57.02-ubuntu22.04",
786788
"driverManagerImage": "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.3.0",
787789
"imagePullSecret": "ngc-secret",
790+
"kernelModuleConfig": "vgpu-manager-kernel-module-config",
788791
}
789792

790793
switch testCase {
@@ -814,6 +817,23 @@ func TestVGPUManager(t *testing.T) {
814817

815818
for _, tc := range testCases {
816819
t.Run(tc.description, func(t *testing.T) {
820+
// Create the kernel module ConfigMap
821+
if tc.clusterPolicy.Spec.VGPUManager.KernelModuleConfig != nil && tc.clusterPolicy.Spec.VGPUManager.KernelModuleConfig.Name != "" {
822+
cm := &corev1.ConfigMap{
823+
ObjectMeta: metav1.ObjectMeta{
824+
Name: tc.clusterPolicy.Spec.VGPUManager.KernelModuleConfig.Name,
825+
Namespace: clusterPolicyController.operatorNamespace,
826+
},
827+
Data: map[string]string{
828+
"nvidia.conf": "# Test vGPU manager kernel module configuration\n",
829+
},
830+
}
831+
err := clusterPolicyController.client.Create(clusterPolicyController.ctx, cm)
832+
if err != nil {
833+
t.Fatalf("error creating kernel module ConfigMap: %v", err)
834+
}
835+
}
836+
817837
ds, err := testDaemonsetCommon(t, tc.clusterPolicy, "VGPUManager", tc.output["numDaemonsets"].(int))
818838
if err != nil {
819839
t.Fatalf("error in testDaemonsetCommon(): %v", err)
@@ -850,6 +870,9 @@ func TestVGPUManager(t *testing.T) {
850870
}
851871

852872
func TestVGPUManagerAssets(t *testing.T) {
873+
// Clear any KernelModuleConfig that might have been set by previous tests to avoid missing ConfigMap errors
874+
clusterPolicyController.singleton.Spec.VGPUManager.KernelModuleConfig = nil
875+
853876
manifestPath := filepath.Join(cfg.root, vGPUManagerAssetsPath)
854877
// add manifests
855878
addState(&clusterPolicyController, manifestPath)

deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2310,6 +2310,13 @@ spec:
23102310
items:
23112311
type: string
23122312
type: array
2313+
kernelModuleConfig:
2314+
description: 'Optional: Kernel module configuration parameters
2315+
for the vGPU manager'
2316+
properties:
2317+
name:
2318+
type: string
2319+
type: object
23132320
repository:
23142321
description: NVIDIA vGPU Manager image repository
23152322
type: string

deployments/gpu-operator/templates/clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,9 @@ spec:
276276
{{- if .Values.vgpuManager.args }}
277277
args: {{ toYaml .Values.vgpuManager.args | nindent 6 }}
278278
{{- end }}
279+
{{- if .Values.vgpuManager.kernelModuleConfig }}
280+
kernelModuleConfig: {{ toYaml .Values.vgpuManager.kernelModuleConfig | nindent 6 }}
281+
{{- end }}
279282
driverManager:
280283
{{- if .Values.vgpuManager.driverManager.repository }}
281284
repository: {{ .Values.vgpuManager.driverManager.repository }}

deployments/gpu-operator/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,9 @@ vgpuManager:
430430
version: v0.9.1
431431
imagePullPolicy: IfNotPresent
432432
env: []
433+
# kernel module configuration for vGPU manager
434+
kernelModuleConfig:
435+
name: ""
433436

434437
vgpuDeviceManager:
435438
enabled: true

0 commit comments

Comments
 (0)