Skip to content

Commit 79706ae

Browse files
committed
Add MIG config support for MIG-backed vGPU configs
This change adds MIG configuration support via nvidia-mig-parted when MIG-backed vGPU types are included in the selected vGPU config, before the vGPU configuration takes place. Specifically: - Add CLI flags for MIG configuration options. - Include the nvidia-mig-parted binary in the container image. - Parse vGPU config to detect MIG requirements, convert the vGPU config to the respective MIG config and configure MIG before vGPUs via nvidia-mig-parted. - nvidia-mig-parted requires NVML. The NVIDIA driver library path is searched and used with the `LD_PRELOAD` env var when running nvidia-mig-parted commands. When NVML is not available, skip MIG configuration and proceed to vGPU configuration. This ensures that the vGPU Device Manager is backwards compatible with components that do not make NVML available to it. Signed-off-by: Michail Resvanis <mresvani@redhat.com>
1 parent 5f37569 commit 79706ae

File tree

107 files changed

+54308
-458
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

107 files changed

+54308
-458
lines changed

api/spec/v1/spec.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ import (
2020
"encoding/json"
2121
"fmt"
2222

23+
migpartedv1 "github.com/NVIDIA/mig-parted/api/spec/v1"
24+
migtypes "github.com/NVIDIA/mig-parted/pkg/types"
25+
2326
"github.com/NVIDIA/vgpu-device-manager/pkg/types"
2427
)
2528

@@ -160,6 +163,44 @@ func (s *VGPUConfigSpec) UnmarshalJSON(b []byte) error {
160163
return nil
161164
}
162165

166+
func (s VGPUConfigSpecSlice) ToMigConfigSpecSlice() (migpartedv1.MigConfigSpecSlice, error) {
167+
var migConfigSpecs migpartedv1.MigConfigSpecSlice
168+
169+
for _, vgpuSpec := range s {
170+
migSpec := migpartedv1.MigConfigSpec{
171+
DeviceFilter: vgpuSpec.DeviceFilter,
172+
Devices: vgpuSpec.Devices,
173+
MigDevices: make(migtypes.MigConfig),
174+
}
175+
176+
migEnabled := false
177+
for vgpuType := range vgpuSpec.VGPUDevices {
178+
vgpu, err := types.ParseVGPUType(vgpuType)
179+
if err != nil {
180+
return nil, fmt.Errorf("failed to parse vGPU type %s: %w", vgpuType, err)
181+
}
182+
183+
if vgpu.G > 0 {
184+
migEnabled = true
185+
migProfile := fmt.Sprintf("%dg.%dgb", vgpu.G, vgpu.GB)
186+
for _, attr := range vgpu.Attr {
187+
if attr == types.AttributeMediaExtensions {
188+
migProfile += ".me"
189+
break
190+
}
191+
}
192+
migSpec.MigDevices[migProfile] = vgpuSpec.VGPUDevices[vgpuType]
193+
}
194+
}
195+
196+
migSpec.MigEnabled = migEnabled
197+
198+
migConfigSpecs = append(migConfigSpecs, migSpec)
199+
}
200+
201+
return migConfigSpecs, nil
202+
}
203+
163204
func containsKey(m map[string]json.RawMessage, s string) bool {
164205
_, exists := m[s]
165206
return exists

api/spec/v1/spec_test.go

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ import (
2121

2222
"github.com/stretchr/testify/require"
2323
"sigs.k8s.io/yaml"
24+
25+
migpartedv1 "github.com/NVIDIA/mig-parted/api/spec/v1"
26+
migtypes "github.com/NVIDIA/mig-parted/pkg/types"
27+
28+
"github.com/NVIDIA/vgpu-device-manager/pkg/types"
2429
)
2530

2631
func TestSpec(t *testing.T) {
@@ -230,3 +235,190 @@ func TestVGPUConfigSpec(t *testing.T) {
230235
}
231236

232237
}
238+
239+
func TestVGPUConfigSpecSliceToMigConfigSpecSlice(t *testing.T) {
240+
testCases := []struct {
241+
Description string
242+
VGPUConfigSpecSlice VGPUConfigSpecSlice
243+
ExpectedMigConfigSpec migpartedv1.MigConfigSpecSlice
244+
ExpectedError string
245+
}{
246+
{
247+
"Empty slice",
248+
VGPUConfigSpecSlice{},
249+
nil,
250+
"",
251+
},
252+
{
253+
"Single MIG-backed vGPU type",
254+
VGPUConfigSpecSlice{
255+
{
256+
DeviceFilter: "MODEL",
257+
Devices: "all",
258+
VGPUDevices: types.VGPUConfig{
259+
"A100-1-5C": 4,
260+
},
261+
},
262+
},
263+
migpartedv1.MigConfigSpecSlice{
264+
{
265+
DeviceFilter: "MODEL",
266+
Devices: "all",
267+
MigEnabled: true,
268+
MigDevices: migtypes.MigConfig{
269+
"1g.5gb": 4,
270+
},
271+
},
272+
},
273+
"",
274+
},
275+
{
276+
"Multiple MIG-backed vGPU types",
277+
VGPUConfigSpecSlice{
278+
{
279+
DeviceFilter: []string{"MODEL1", "MODEL2"},
280+
Devices: []int{0, 1},
281+
VGPUDevices: types.VGPUConfig{
282+
"A100-1-5C": 2,
283+
"A100-2-10C": 1,
284+
},
285+
},
286+
},
287+
migpartedv1.MigConfigSpecSlice{
288+
{
289+
DeviceFilter: []string{"MODEL1", "MODEL2"},
290+
Devices: []int{0, 1},
291+
MigEnabled: true,
292+
MigDevices: migtypes.MigConfig{
293+
"1g.5gb": 2,
294+
"2g.10gb": 1,
295+
},
296+
},
297+
},
298+
"",
299+
},
300+
{
301+
"MIG-backed vGPU type with media extensions",
302+
VGPUConfigSpecSlice{
303+
{
304+
Devices: "all",
305+
VGPUDevices: types.VGPUConfig{
306+
"A100-1-5CME": 2,
307+
},
308+
},
309+
},
310+
migpartedv1.MigConfigSpecSlice{
311+
{
312+
Devices: "all",
313+
MigEnabled: true,
314+
MigDevices: migtypes.MigConfig{
315+
"1g.5gb.me": 2,
316+
},
317+
},
318+
},
319+
"",
320+
},
321+
{
322+
"Non-MIG vGPU type",
323+
VGPUConfigSpecSlice{
324+
{
325+
Devices: "all",
326+
VGPUDevices: types.VGPUConfig{
327+
"A100-40C": 2,
328+
},
329+
},
330+
},
331+
migpartedv1.MigConfigSpecSlice{
332+
{
333+
Devices: "all",
334+
MigEnabled: false,
335+
MigDevices: migtypes.MigConfig{},
336+
},
337+
},
338+
"",
339+
},
340+
{
341+
"Mixed MIG and non-MIG vGPU types",
342+
VGPUConfigSpecSlice{
343+
{
344+
Devices: "all",
345+
VGPUDevices: types.VGPUConfig{
346+
"A100-40C": 1,
347+
"A100-1-5C": 2,
348+
},
349+
},
350+
},
351+
migpartedv1.MigConfigSpecSlice{
352+
{
353+
Devices: "all",
354+
MigEnabled: true,
355+
MigDevices: migtypes.MigConfig{
356+
"1g.5gb": 2,
357+
},
358+
},
359+
},
360+
"",
361+
},
362+
{
363+
"Multiple specs with different configurations",
364+
VGPUConfigSpecSlice{
365+
{
366+
Devices: "all",
367+
VGPUDevices: types.VGPUConfig{
368+
"A100-1-5C": 4,
369+
},
370+
},
371+
{
372+
DeviceFilter: "MODEL",
373+
Devices: []int{0, 1},
374+
VGPUDevices: types.VGPUConfig{
375+
"A100-40C": 1,
376+
},
377+
},
378+
},
379+
migpartedv1.MigConfigSpecSlice{
380+
{
381+
Devices: "all",
382+
MigEnabled: true,
383+
MigDevices: migtypes.MigConfig{
384+
"1g.5gb": 4,
385+
},
386+
},
387+
{
388+
DeviceFilter: "MODEL",
389+
Devices: []int{0, 1},
390+
MigEnabled: false,
391+
MigDevices: migtypes.MigConfig{},
392+
},
393+
},
394+
"",
395+
},
396+
{
397+
"Invalid vGPU type",
398+
VGPUConfigSpecSlice{
399+
{
400+
Devices: "all",
401+
VGPUDevices: types.VGPUConfig{
402+
"InvalidType": 1,
403+
},
404+
},
405+
},
406+
nil,
407+
"failed to parse vGPU type InvalidType:",
408+
},
409+
}
410+
411+
for _, tc := range testCases {
412+
t.Run(tc.Description, func(t *testing.T) {
413+
result, err := tc.VGPUConfigSpecSlice.ToMigConfigSpecSlice()
414+
if tc.ExpectedError != "" {
415+
require.NotNil(t, err, "Expected failure but got success")
416+
require.Nil(t, result, "Expected nil result on failure")
417+
require.ErrorContains(t, err, tc.ExpectedError)
418+
} else {
419+
require.Nil(t, err, "Unexpected failure: %v", err)
420+
require.Equal(t, tc.ExpectedMigConfigSpec, result, "Unexpected result")
421+
}
422+
})
423+
}
424+
}

0 commit comments

Comments
 (0)