Skip to content

Commit 3dd1e82

Browse files
Reduce redundant NVML Init/Shutdown cycles
nvidia-mig-manager.service consumes excessive CPU time (~42s on DGX B300 with 8x B200 GPUs) for operations completing in ~9s wall time. The root cause is 68-100 redundant NVML Init/Shutdown cycles per service run. Each nvml.Init() triggers dlopen("libnvidia-ml.so.1") + 24 dlsym() calls to resolve versioned API symbols — expensive on multi-GPU systems.

The overhead comes from two compounding patterns:

1. Every method on nvmlMigModeManager and nvmlMigConfigManager independently calls Init()/Shutdown(), despite callers already maintaining an initialized NVML instance at the command level.
2. Callers create new nvml.New() instances inside per-GPU loops, each triggering a full Init/Shutdown cycle including version checks.

Fix by:

- Accepting nvml.Interface in constructors (aligning real constructors with mock constructors that already accept it)
- Removing per-method Init/Shutdown from all 7 manager methods
- Hoisting manager creation out of per-device loops to create once per command

This reduces NVML Init/Shutdown from ~100 to 1 per command, cutting CPU time by 4.7x and dlsym calls from 572 to 45 (12.7x reduction).

Signed-off-by: Rajath Agasthya <ragasthya@nvidia.com>
1 parent 16546dd commit 3dd1e82

File tree

13 files changed

+66
-150
lines changed

13 files changed

+66
-150
lines changed

cmd/nvidia-mig-parted/apply/config.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,17 @@ func ApplyMigConfig(c *Context) error {
3333
}
3434
defer util.TryNvmlShutdown(c.Nvml)
3535

36-
return assert.WalkSelectedMigConfigForEachGPU(c.MigConfig, func(mc *v1.MigConfigSpec, i int, d types.DeviceID) error {
37-
modeManager, err := util.NewMigModeManager()
38-
if err != nil {
39-
return fmt.Errorf("error creating MIG mode Manager: %v", err)
40-
}
36+
modeManager, err := util.NewMigModeManager(c.Nvml)
37+
if err != nil {
38+
return fmt.Errorf("error creating MIG mode Manager: %w", err)
39+
}
40+
41+
configManager, err := util.NewMigConfigManager(c.Nvml)
42+
if err != nil {
43+
return fmt.Errorf("error creating MIG config Manager: %w", err)
44+
}
4145

46+
return assert.WalkSelectedMigConfigForEachGPU(c.MigConfig, func(mc *v1.MigConfigSpec, i int, d types.DeviceID) error {
4247
capable, err := modeManager.IsMigCapable(i)
4348
if err != nil {
4449
return fmt.Errorf("error checking MIG capable: %v", err)
@@ -77,11 +82,6 @@ func ApplyMigConfig(c *Context) error {
7782
return nil
7883
}
7984

80-
configManager, err := util.NewMigConfigManager()
81-
if err != nil {
82-
return fmt.Errorf("error creating MIG config Manager: %v", err)
83-
}
84-
8585
current, err := configManager.GetMigConfig(i)
8686
if err != nil {
8787
return fmt.Errorf("error getting MIGConfig: %v", err)

cmd/nvidia-mig-parted/apply/mode.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,21 @@ func ApplyMigMode(c *Context) error {
4545
return fmt.Errorf("error enumerating GPUs: %v", err)
4646
}
4747

48+
modeManager, err := util.NewMigModeManager(c.Nvml)
49+
if err != nil {
50+
return fmt.Errorf("error creating MIG mode Manager: %w", err)
51+
}
52+
53+
configManager := config.NewNvmlMigConfigManager(c.Nvml)
54+
4855
pending := make([]bool, len(deviceIDs))
4956
err = assert.WalkSelectedMigConfigForEachGPU(c.MigConfig, func(mc *v1.MigConfigSpec, i int, d types.DeviceID) error {
5057
desiredMode := mode.Disabled
5158
if mc.MigEnabled {
5259
desiredMode = mode.Enabled
5360
}
5461

55-
manager, err := util.NewMigModeManager()
56-
if err != nil {
57-
return fmt.Errorf("error creating MIG mode Manager: %v", err)
58-
}
59-
60-
capable, err := manager.IsMigCapable(i)
62+
capable, err := modeManager.IsMigCapable(i)
6163
if err != nil {
6264
return fmt.Errorf("error checking MIG capable: %v", err)
6365
}
@@ -77,28 +79,27 @@ func ApplyMigMode(c *Context) error {
7779
return fmt.Errorf("cannot set MIG mode on non MIG-capable GPU")
7880
}
7981

80-
currentMode, err := manager.GetMigMode(i)
82+
currentMode, err := modeManager.GetMigMode(i)
8183
if err != nil {
8284
return fmt.Errorf("error getting MIG mode: %v", err)
8385
}
8486
log.Debugf(" Current MIG mode: %v", currentMode)
8587

8688
if nvidiaModuleLoaded && currentMode != mode.Disabled {
8789
log.Debugf(" Clearing existing MIG configuration")
88-
manager := config.NewNvmlMigConfigManager()
89-
err := manager.ClearMigConfig(i)
90+
err := configManager.ClearMigConfig(i)
9091
if err != nil {
9192
return fmt.Errorf("error clearing existing MIG configurations: %v", err)
9293
}
9394
}
9495

9596
log.Debugf(" Updating MIG mode: %v", desiredMode)
96-
err = manager.SetMigMode(i, desiredMode)
97+
err = modeManager.SetMigMode(i, desiredMode)
9798
if err != nil {
9899
return fmt.Errorf("error setting MIG mode: %v", err)
99100
}
100101

101-
pending[i], err = manager.IsMigModeChangePending(i)
102+
pending[i], err = modeManager.IsMigModeChangePending(i)
102103
if err != nil {
103104
return fmt.Errorf("error checking pending MIG mode change: %v", err)
104105
}

cmd/nvidia-mig-parted/assert/config.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,18 @@ func AssertMigConfig(c *Context) error {
3737
return fmt.Errorf("error enumerating GPUs: %v", err)
3838
}
3939

40+
modeManager, err := util.NewMigModeManager(c.Nvml)
41+
if err != nil {
42+
return fmt.Errorf("error creating MIG Mode Manager: %w", err)
43+
}
44+
45+
configManager, err := util.NewMigConfigManager(c.Nvml)
46+
if err != nil {
47+
return fmt.Errorf("error creating MIG Config Manager: %w", err)
48+
}
49+
4050
matched := make([]bool, len(deviceIDs))
4151
err = WalkSelectedMigConfigForEachGPU(c.MigConfig, func(mc *v1.MigConfigSpec, i int, d types.DeviceID) error {
42-
modeManager, err := util.NewMigModeManager()
43-
if err != nil {
44-
return fmt.Errorf("error creating MIG Mode Manager: %v", err)
45-
}
46-
4752
capable, err := modeManager.IsMigCapable(i)
4853
if err != nil {
4954
return fmt.Errorf("error checking MIG capable: %v", err)
@@ -64,11 +69,6 @@ func AssertMigConfig(c *Context) error {
6469
return nil
6570
}
6671

67-
configManager, err := util.NewMigConfigManager()
68-
if err != nil {
69-
return fmt.Errorf("error creating MIG Config Manager: %v", err)
70-
}
71-
7272
current, err := configManager.GetMigConfig(i)
7373
if err != nil {
7474
return fmt.Errorf("error getting MIGConfig: %v", err)

cmd/nvidia-mig-parted/assert/mode.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,18 +39,18 @@ func AssertMigMode(c *Context) error {
3939
defer util.TryNvmlShutdown(c.Nvml)
4040
}
4141

42+
manager, err := util.NewMigModeManager(c.Nvml)
43+
if err != nil {
44+
return fmt.Errorf("error creating MIG mode Manager: %w", err)
45+
}
46+
4247
return WalkSelectedMigConfigForEachGPU(c.MigConfig, func(mc *v1.MigConfigSpec, i int, d types.DeviceID) error {
4348
if mc.MigEnabled {
4449
log.Debugf(" Asserting MIG mode: %v", mode.Enabled)
4550
} else {
4651
log.Debugf(" Asserting MIG mode: %v", mode.Disabled)
4752
}
4853

49-
manager, err := util.NewMigModeManager()
50-
if err != nil {
51-
return fmt.Errorf("error creating MIG mode Manager: %v", err)
52-
}
53-
5454
capable, err := manager.IsMigCapable(i)
5555
if err != nil {
5656
return fmt.Errorf("error checking MIG capable: %v", err)

cmd/nvidia-mig-parted/checkpoint/checkpoint.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func checkpointWrapper(c *cli.Context, f *Flags) error {
9494
}
9595
defer util.TryNvmlShutdown(nvml)
9696

97-
migState, err := state.NewMigStateManager().Fetch()
97+
migState, err := state.NewMigStateManager(nvml).Fetch()
9898
if err != nil {
9999
return fmt.Errorf("error fetching MIG state: %v", err)
100100
}

cmd/nvidia-mig-parted/export/config.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,20 @@ func ExportMigConfigs(c *Context) (*v1.Spec, error) {
3838
return nil, fmt.Errorf("error enumerating GPUs: %v", err)
3939
}
4040

41+
modeManager, err := util.NewMigModeManager(c.Nvml)
42+
if err != nil {
43+
return nil, fmt.Errorf("error creating MIG Mode Manager: %w", err)
44+
}
45+
46+
configManager, err := util.NewMigConfigManager(c.Nvml)
47+
if err != nil {
48+
return nil, fmt.Errorf("error creating MIG Config Manager: %w", err)
49+
}
50+
4151
configSpecs := make(v1.MigConfigSpecSlice, len(deviceIDs))
4252
for i, deviceID := range deviceIDs {
4353
deviceFilter := deviceID.String()
4454

45-
modeManager, err := util.NewMigModeManager()
46-
if err != nil {
47-
return nil, fmt.Errorf("error creating MIG Mode Manager: %v", err)
48-
}
49-
5055
enabled := false
5156
capable, err := modeManager.IsMigCapable(i)
5257
if err != nil {
@@ -62,11 +67,6 @@ func ExportMigConfigs(c *Context) (*v1.Spec, error) {
6267

6368
migDevices := types.MigConfig{}
6469
if enabled {
65-
configManager, err := util.NewMigConfigManager()
66-
if err != nil {
67-
return nil, fmt.Errorf("error creating MIG Config Manager: %v", err)
68-
}
69-
7070
migDevices, err = configManager.GetMigConfig(i)
7171
if err != nil {
7272
return nil, fmt.Errorf("error getting MIGConfig: %v", err)

cmd/nvidia-mig-parted/restore/restore.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626
"github.com/sirupsen/logrus"
2727
cli "github.com/urfave/cli/v2"
2828

29+
"github.com/NVIDIA/go-nvml/pkg/nvml"
30+
2931
checkpoint "github.com/NVIDIA/mig-parted/api/checkpoint/v1"
3032
hooks "github.com/NVIDIA/mig-parted/api/hooks/v1"
3133
"github.com/NVIDIA/mig-parted/cmd/nvidia-mig-parted/apply"
@@ -179,7 +181,7 @@ func restoreWrapper(c *cli.Context, f *Flags) error {
179181
Flags: f,
180182
Hooks: apply.NewApplyHooks(hooksSpec.Hooks),
181183
MigState: &checkpoint.MigState,
182-
MigStateManager: state.NewMigStateManager(),
184+
MigStateManager: state.NewMigStateManager(nvml.New()),
183185
}
184186

185187
err = apply.ApplyMigConfigWithHooks(log, c, f.ModeOnly, context.Hooks, &context)

cmd/nvidia-mig-parted/util/mig.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@ package util
1919
import (
2020
"fmt"
2121

22+
"github.com/NVIDIA/go-nvml/pkg/nvml"
23+
2224
"github.com/NVIDIA/mig-parted/pkg/mig/config"
2325
"github.com/NVIDIA/mig-parted/pkg/mig/mode"
2426
)
2527

26-
func NewMigModeManager() (mode.Manager, error) {
28+
func NewMigModeManager(nvmlLib nvml.Interface) (mode.Manager, error) {
2729
nvidiaModuleLoaded, err := IsNvidiaModuleLoaded()
2830
if err != nil {
2931
return nil, fmt.Errorf("error checking if nvidia module loaded: %v", err)
@@ -32,18 +34,18 @@ func NewMigModeManager() (mode.Manager, error) {
3234
return mode.NewPciMigModeManager(), nil
3335
}
3436

35-
nvmlSupported, err := IsNVMLVersionSupported()
37+
nvmlSupported, err := IsNVMLVersionSupported(nvmlLib)
3638
if err != nil {
3739
return nil, fmt.Errorf("error checking NVML version: %v", err)
3840
}
3941
if !nvmlSupported {
4042
return mode.NewPciMigModeManager(), nil
4143
}
4244

43-
return mode.NewNvmlMigModeManager(), nil
45+
return mode.NewNvmlMigModeManager(nvmlLib), nil
4446
}
4547

46-
func NewMigConfigManager() (config.Manager, error) {
48+
func NewMigConfigManager(nvmlLib nvml.Interface) (config.Manager, error) {
4749
nvidiaModuleLoaded, err := IsNvidiaModuleLoaded()
4850
if err != nil {
4951
return nil, fmt.Errorf("error checking if nvidia module loaded: %v", err)
@@ -52,13 +54,13 @@ func NewMigConfigManager() (config.Manager, error) {
5254
return nil, fmt.Errorf("nvidia module not loaded")
5355
}
5456

55-
nvmlSupported, err := IsNVMLVersionSupported()
57+
nvmlSupported, err := IsNVMLVersionSupported(nvmlLib)
5658
if err != nil {
5759
return nil, fmt.Errorf("error checking NVML version: %v", err)
5860
}
5961
if !nvmlSupported {
6062
return nil, fmt.Errorf("NVML version unsupported for performing MIG operations")
6163
}
6264

63-
return config.NewNvmlMigConfigManager(), nil
65+
return config.NewNvmlMigConfigManager(nvmlLib), nil
6466
}

cmd/nvidia-mig-parted/util/nvml.go

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,20 +45,7 @@ func IsNvidiaModuleLoaded() (bool, error) {
4545
return false, nil
4646
}
4747

48-
func IsNVMLVersionSupported() (bool, error) {
49-
nvmlLib := nvml.New()
50-
51-
ret := nvmlLib.Init()
52-
if ret != nvml.SUCCESS {
53-
return false, fmt.Errorf("error initializing NVML: %v", ret)
54-
}
55-
defer func() {
56-
ret := nvmlLib.Shutdown()
57-
if ret != nvml.SUCCESS {
58-
log.Warnf("error shutting down NVML: %v", ret)
59-
}
60-
}()
61-
48+
func IsNVMLVersionSupported(nvmlLib nvml.Interface) (bool, error) {
6249
sversion, ret := nvmlLib.SystemGetNVMLVersion()
6350
if ret != nvml.SUCCESS {
6451
return false, fmt.Errorf("error getting getting version: %v", ret)

pkg/mig/config/config.go

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -41,28 +41,11 @@ type nvmlMigConfigManager struct {
4141

4242
var _ Manager = (*nvmlMigConfigManager)(nil)
4343

44-
func tryNvmlShutdown(nvmlLib nvml.Interface) {
45-
ret := nvmlLib.Shutdown()
46-
if ret != nvml.SUCCESS {
47-
log.Warnf("Error shutting down NVML: %v", ret)
48-
}
49-
}
50-
51-
func NewNvmlMigConfigManager() Manager {
52-
return &nvmlMigConfigManager{nvml.New(), nvlib.New()}
53-
}
54-
55-
func NewMockNvmlMigConfigManager(nvml nvml.Interface) Manager {
44+
func NewNvmlMigConfigManager(nvml nvml.Interface) Manager {
5645
return &nvmlMigConfigManager{nvml, nvlib.NewMock(nvml)}
5746
}
5847

5948
func (m *nvmlMigConfigManager) GetMigConfig(gpu int) (types.MigConfig, error) {
60-
ret := m.nvml.Init()
61-
if ret != nvml.SUCCESS {
62-
return nil, fmt.Errorf("error initializing NVML: %v", ret)
63-
}
64-
defer tryNvmlShutdown(m.nvml)
65-
6649
device, ret := m.nvml.DeviceGetHandleByIndex(gpu)
6750
if ret != nvml.SUCCESS {
6851
return nil, fmt.Errorf("error getting device handle: %v", ret)
@@ -101,12 +84,6 @@ func (m *nvmlMigConfigManager) GetMigConfig(gpu int) (types.MigConfig, error) {
10184
}
10285

10386
func (m *nvmlMigConfigManager) SetMigConfig(gpu int, config types.MigConfig) error {
104-
ret := m.nvml.Init()
105-
if ret != nvml.SUCCESS {
106-
return fmt.Errorf("error initializing NVML: %v", ret)
107-
}
108-
defer tryNvmlShutdown(m.nvml)
109-
11087
device, ret := m.nvml.DeviceGetHandleByIndex(gpu)
11188
if ret != nvml.SUCCESS {
11289
return fmt.Errorf("error getting device handle: %v", ret)
@@ -213,12 +190,6 @@ func (m *nvmlMigConfigManager) SetMigConfig(gpu int, config types.MigConfig) err
213190
}
214191

215192
func (m *nvmlMigConfigManager) ClearMigConfig(gpu int) error {
216-
ret := m.nvml.Init()
217-
if ret != nvml.SUCCESS {
218-
return fmt.Errorf("error initializing NVML: %v", ret)
219-
}
220-
defer tryNvmlShutdown(m.nvml)
221-
222193
device, ret := m.nvml.DeviceGetHandleByIndex(gpu)
223194
if ret != nvml.SUCCESS {
224195
return fmt.Errorf("error getting device handle: %v", ret)

0 commit comments

Comments
 (0)