From fe4b87105b7a8d973f52ee5d15931ffd3980ed1a Mon Sep 17 00:00:00 2001 From: hawkli-1994 <11769524+hawkli-1994@users.noreply.github.com> Date: Tue, 6 Jan 2026 12:34:58 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E5=88=9D=E6=AD=A5=E8=B7=91=E9=80=9A?= =?UTF-8?q?=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 7 ++ README.md | 75 +++++++++++++++++++ api/config/v1/config.go | 10 +++ api/config/v1/consts.go | 1 + api/config/v1/flags.go | 3 + api/config/v1/resources.go | 6 +- cmd/gpu-feature-discovery/main.go | 16 ++++ cmd/gpu-feature-discovery/mig_test.go | 71 ++++++++++++++++++ cmd/nvidia-device-plugin/main.go | 17 +++++ deployments/container/Dockerfile | 2 +- deployments/container/Makefile | 41 +++++++++- .../helm/nvidia-device-plugin/values.yaml | 1 + internal/lm/mig-strategy.go | 19 ++--- internal/lm/nvml.go | 16 ++-- internal/lm/nvml_test.go | 3 +- internal/lm/resource.go | 45 +++++++++-- internal/lm/strategy.go | 7 +- internal/rm/rm.go | 17 ++++- 18 files changed, 327 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 4c5a9150c..8d61cf32b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,10 @@ bin *.out *.log ginkgo.json +docs/custom-resource-name-tasks.md +docs/custom-resource-name-prefix.md +docs/custom-resource-name-prefix_zh.md +CLAUDE.md +custom-nvidia-device-plugin.yaml +build-application.sh +push-application.sh diff --git a/README.md b/README.md index 959e2d45e..1d55e2ed0 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,47 @@ Test PASSED Done ``` +#### Using Custom Resource Name Prefix + +If you have configured the device plugin with a custom resource name prefix, you can request GPUs using that prefix: + +```shell +# First, deploy the device plugin with a custom prefix +cat < [!WARNING] > If you do not request GPUs when you use the device plugin, the plugin exposes all the GPUs on the machine inside your container. 
@@ -221,6 +262,7 @@ deploying the plugin via `helm`. | `--pass-device-specs` | `$PASS_DEVICE_SPECS` | `false` | | `--device-list-strategy` | `$DEVICE_LIST_STRATEGY` | `"envvar"` | | `--device-id-strategy` | `$DEVICE_ID_STRATEGY` | `"uuid"` | +| `--resource-name-prefix` | `$RESOURCE_NAME_PREFIX` | `"nvidia.com"` | | `--config-file` | `$CONFIG_FILE` | `""` | ### As a configuration file @@ -231,6 +273,7 @@ flags: migStrategy: "none" failOnInitError: true nvidiaDriverRoot: "/" + resourceNamePrefix: "nvidia.com" plugin: passDeviceSpecs: false deviceListStrategy: "envvar" @@ -352,6 +395,38 @@ options outside of this section are shared. launch time. As described below, a `ConfigMap` can be used to point the plugin at a desired configuration file when deploying via `helm`. +**`RESOURCE_NAME_PREFIX`**: + customize the prefix for GPU resource names and labels + + `(default 'nvidia.com')` + + The `RESOURCE_NAME_PREFIX` option allows you to customize the domain prefix + for all GPU-related resources and labels. By default, resources are exposed as + `nvidia.com/gpu`, but you can change this to any custom domain (e.g., + `custom.domain/gpu`). + + This affects: + - GPU resource names in pod specs: `{prefix}/gpu` + - MIG resource names: `{prefix}/mig-*` + - All node labels generated by gpu-feature-discovery: `{prefix}/*` + + **Example**: + ```yaml + # Custom resource name prefix + flags: + resourceNamePrefix: "custom.domain" + + # Resources will be advertised as: custom.domain/gpu + # Labels will be: custom.domain/gpu.count, custom.domain/gpu.product, etc. 
+ ``` + + **Important Notes**: + - The prefix must not contain '/' + - The prefix must be 253 characters or fewer + - This prefix must match across both the device plugin and gpu-feature-discovery + - Changing this prefix requires updating all pod specs to request resources using the new prefix + - This is useful for multi-tenant clusters or environments requiring custom resource naming + ### Shared Access to GPUs The NVIDIA device plugin allows oversubscription of GPUs through a set of diff --git a/api/config/v1/config.go b/api/config/v1/config.go index 5df5284cf..55807f71c 100644 --- a/api/config/v1/config.go +++ b/api/config/v1/config.go @@ -39,6 +39,16 @@ type Config struct { Imex Imex `json:"imex,omitempty" yaml:"imex,omitempty"` } +// GetResourceNamePrefix returns the configured resource name prefix. +// If the prefix is unset or empty, it returns DefaultResourceNamePrefix. +func (c *Config) GetResourceNamePrefix() string { + if c.Flags.ResourceNamePrefix != nil && *c.Flags.ResourceNamePrefix != "" { + return *c.Flags.ResourceNamePrefix + } + return DefaultResourceNamePrefix +} + + // NewConfig builds out a Config struct from a config file (or command line flags). // The data stored in the config will be populated in order of precedence from // (1) command line, (2) environment variable, (3) config file. 
diff --git a/api/config/v1/consts.go b/api/config/v1/consts.go index 43b526707..73e759a5a 100644 --- a/api/config/v1/consts.go +++ b/api/config/v1/consts.go @@ -22,6 +22,7 @@ import ( // Constants related to resource names const ( + DefaultResourceNamePrefix = "nvidia.com" ResourceNamePrefix = "nvidia.com" DefaultSharedResourceNameSuffix = ".shared" MaxResourceNameLength = 63 diff --git a/api/config/v1/flags.go b/api/config/v1/flags.go index 457602823..95f34fa5d 100644 --- a/api/config/v1/flags.go +++ b/api/config/v1/flags.go @@ -57,6 +57,7 @@ type Flags struct { type CommandLineFlags struct { MigStrategy *string `json:"migStrategy" yaml:"migStrategy"` FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"` + ResourceNamePrefix *string `json:"resourceNamePrefix,omitempty" yaml:"resourceNamePrefix,omitempty"` MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"` NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"` NvidiaDevRoot *string `json:"nvidiaDevRoot,omitempty" yaml:"nvidiaDevRoot,omitempty"` @@ -121,6 +122,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) { updateFromCLIFlag(&f.MigStrategy, c, n) case "fail-on-init-error": updateFromCLIFlag(&f.FailOnInitError, c, n) + case "resource-name-prefix": + updateFromCLIFlag(&f.ResourceNamePrefix, c, n) case "mps-root": updateFromCLIFlag(&f.MpsRoot, c, n) case "driver-root", "nvidia-driver-root": diff --git a/api/config/v1/resources.go b/api/config/v1/resources.go index 26869da02..65b2cc1b8 100644 --- a/api/config/v1/resources.go +++ b/api/config/v1/resources.go @@ -19,6 +19,7 @@ package v1 import ( "encoding/json" "fmt" + "os" "regexp" "strings" @@ -46,7 +47,8 @@ type Resources struct { // NewResourceName builds a resource name from the standard prefix and a name. // An error is returned if the format is incorrect. 
func NewResourceName(n string) (ResourceName, error) { - if !strings.HasPrefix(n, ResourceNamePrefix+"/") { + // Only add default prefix if the name doesn't already contain a '/' (i.e., no custom prefix) + if !strings.Contains(n, "/") { n = ResourceNamePrefix + "/" + n } @@ -73,6 +75,8 @@ func NewResource(pattern, name string) (*Resource, error) { Pattern: ResourcePattern(pattern), Name: resourceName, } + // Log to stderr (visible in k8s logs) + fmt.Fprintf(os.Stderr, "[INFO] Created resource: pattern=%s, name=%s\n", pattern, resourceName) return r, nil } diff --git a/cmd/gpu-feature-discovery/main.go b/cmd/gpu-feature-discovery/main.go index c6a505efd..51a3ae124 100644 --- a/cmd/gpu-feature-discovery/main.go +++ b/cmd/gpu-feature-discovery/main.go @@ -61,6 +61,12 @@ func main() { Usage: "fail the plugin if an error is encountered during initialization, otherwise block indefinitely", EnvVars: []string{"GFD_FAIL_ON_INIT_ERROR", "FAIL_ON_INIT_ERROR"}, }, + &cli.StringFlag{ + Name: "resource-name-prefix", + Value: "nvidia.com", + Usage: "the prefix to use for resource names (e.g., 'nvidia.com' for nvidia.com/gpu)", + EnvVars: []string{"GFD_RESOURCE_NAME_PREFIX", "RESOURCE_NAME_PREFIX"}, + }, &cli.BoolFlag{ Name: "oneshot", Value: false, @@ -138,6 +144,16 @@ func validateFlags(config *spec.Config) error { default: return fmt.Errorf("invalid --device-discovery-strategy option %v", *config.Flags.DeviceDiscoveryStrategy) } + + // Validate resource name prefix format + if config.Flags.ResourceNamePrefix != nil && *config.Flags.ResourceNamePrefix != "" { + prefix := *config.Flags.ResourceNamePrefix + if prefix != "nvidia.com" { + klog.Warningf("Using custom resource name prefix: %s (default is nvidia.com)", prefix) + klog.Warning("All pods requesting GPU resources must be updated to use the new resource name format") + } + } + return nil } diff --git a/cmd/gpu-feature-discovery/mig_test.go b/cmd/gpu-feature-discovery/mig_test.go index b8e24b6f9..d37b04ccd 100644 --- 
a/cmd/gpu-feature-discovery/mig_test.go +++ b/cmd/gpu-feature-discovery/mig_test.go @@ -352,3 +352,74 @@ func TestMigStrategyMixed(t *testing.T) { require.Contains(t, labels, "nvidia.com/mig-3g.20gb.count", "Missing label") require.Contains(t, labels, "nvidia.com/mig-1g.5gb.count", "Missing label") } + +func TestMigStrategySingleWithCustomPrefix(t *testing.T) { + // create VGPU mock library with empty vgpu devices + vgpuMock := NewTestVGPUMock() + devices := []resource.Device{ + rt.NewMigEnabledDevice( + rt.NewMigDevice(3, 0, 20), + rt.NewMigDevice(3, 0, 20), + ), + } + nvmlMock := rt.NewManagerMockWithDevices(devices...) + + conf := &spec.Config{ + Flags: spec.Flags{ + CommandLineFlags: spec.CommandLineFlags{ + MigStrategy: ptr("single"), + ResourceNamePrefix: ptr("custom.domain"), + FailOnInitError: ptr(true), + GFD: &spec.GFDCommandLineFlags{ + Oneshot: ptr(true), + OutputFile: ptr("./gfd-test-mig-single-custom"), + SleepInterval: ptr(spec.Duration(time.Second)), + NoTimestamp: ptr(false), + MachineTypeFile: ptr(testMachineTypeFile), + }, + }, + }, + } + + setupMachineFile(t) + defer removeMachineFile(t) + + labelOutputer, err := lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}) + require.NoError(t, err) + + d := gfd{ + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: labelOutputer, + } + restart, err := d.run(nil) + require.NoError(t, err, "Error from run function") + require.False(t, restart) + + outFile, err := os.Open(*conf.Flags.GFD.OutputFile) + require.NoError(t, err, "Opening output file") + + defer func() { + err = outFile.Close() + require.NoError(t, err, "Closing output file") + err = os.Remove(*conf.Flags.GFD.OutputFile) + require.NoError(t, err, "Removing output file") + }() + + output, err := io.ReadAll(outFile) + require.NoError(t, err, "Reading output file") + + labels, err := buildLabelMapFromOutput(output) + require.NoError(t, err, "Building map of labels from output file") + + // Verify custom prefix is used in 
labels + require.Equal(t, labels["custom.domain/mig.strategy"], "single", "Incorrect label") + require.Equal(t, labels["custom.domain/gpu.count"], "2", "Incorrect label") + require.Equal(t, labels["custom.domain/gpu.product"], "MOCKMODEL-MIG-3g.20gb", "Incorrect label") + require.Equal(t, labels["custom.domain/gpu.memory"], "20", "Incorrect label") + + // Verify default nvidia.com labels are NOT present + require.NotContains(t, labels, "nvidia.com/mig.strategy", "Default prefix should not be present") + require.NotContains(t, labels, "nvidia.com/gpu.count", "Default prefix should not be present") +} diff --git a/cmd/nvidia-device-plugin/main.go b/cmd/nvidia-device-plugin/main.go index 4c6754317..9904a8515 100644 --- a/cmd/nvidia-device-plugin/main.go +++ b/cmd/nvidia-device-plugin/main.go @@ -69,6 +69,12 @@ func main() { Usage: "fail the plugin if an error is encountered during initialization, otherwise block indefinitely", EnvVars: []string{"FAIL_ON_INIT_ERROR"}, }, + &cli.StringFlag{ + Name: "resource-name-prefix", + Value: "nvidia.com", + Usage: "the prefix to use for resource names (e.g., 'nvidia.com' for nvidia.com/gpu)", + EnvVars: []string{"RESOURCE_NAME_PREFIX"}, + }, &cli.StringFlag{ Name: "driver-root", Aliases: []string{"nvidia-driver-root"}, @@ -228,6 +234,17 @@ func validateFlags(infolib nvinfo.Interface, config *spec.Config) error { return fmt.Errorf("invalid IMEX channel IDs: %w", err) } + // Validate resource name prefix format + if config.Flags.ResourceNamePrefix != nil && *config.Flags.ResourceNamePrefix != "" { + prefix := *config.Flags.ResourceNamePrefix + if prefix == "nvidia.com" { + // This is the default, no special validation needed + return nil + } + klog.Warningf("Using custom resource name prefix: %s (default is nvidia.com)", prefix) + klog.Warning("All pods requesting GPU resources must be updated to use the new resource name format") + } + return nil } diff --git a/deployments/container/Dockerfile b/deployments/container/Dockerfile 
index e85b4f135..adba6b8fa 100644 --- a/deployments/container/Dockerfile +++ b/deployments/container/Dockerfile @@ -20,7 +20,7 @@ FROM base AS devel WORKDIR /work COPY * . -ARG GOPROXY="https://proxy.golang.org,direct" +ARG GOPROXY="https://goproxy.cn,direct" ENV GOPROXY=$GOPROXY RUN make install-tools diff --git a/deployments/container/Makefile b/deployments/container/Makefile index 1a838fdc6..d4be72679 100644 --- a/deployments/container/Makefile +++ b/deployments/container/Makefile @@ -16,6 +16,10 @@ BUILD_MULTI_ARCH_IMAGES ?= no DOCKER ?= docker MKDIR ?= mkdir +# Control whether to check for updates to base images during build +# Set to 'true' to pass --pull to docker build; any other value (including the default 'false' or an empty string) skips the check and uses local images only +DOCKER_PULL ?= false + ##### Global variables ##### include $(CURDIR)/versions.mk @@ -43,8 +47,9 @@ IMAGE_TARGETS := $(patsubst %,image-%,$(DISTRIBUTIONS)) BUILD_TARGETS := $(patsubst %,build-%,$(DISTRIBUTIONS)) PUSH_TARGETS := $(patsubst %,push-%,$(DISTRIBUTIONS)) TEST_TARGETS := $(patsubst %,test-%, $(DISTRIBUTIONS)) +APPLICATION_TARGETS := $(patsubst %,application-%,$(DISTRIBUTIONS)) -.PHONY: $(DISTRIBUTIONS) $(PUSH_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) $(BUILD_TARGETS) +.PHONY: $(DISTRIBUTIONS) $(PUSH_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) $(BUILD_TARGETS) $(APPLICATION_TARGETS) application push-application ifneq ($(BUILD_MULTI_ARCH_IMAGES),true) include $(CURDIR)/deployments/container/native-only.mk @@ -66,9 +71,16 @@ endif DOCKERFILE = $(CURDIR)/deployments/container/Dockerfile +# Conditionally add --pull flag based on DOCKER_PULL variable +ifeq ($(DOCKER_PULL),true) +DOCKER_PULL_FLAG = --pull +else +DOCKER_PULL_FLAG = +endif + # Use a generic build target to build the relevant images $(IMAGE_TARGETS): image-%: - $(DOCKER) build --pull \ + $(DOCKER) build $(DOCKER_PULL_FLAG) \ --provenance=false --sbom=false \ $(DOCKER_BUILD_OPTIONS) \ $(DOCKER_BUILD_PLATFORM_OPTIONS) \ @@ -80,6 +92,24 @@ $(IMAGE_TARGETS): image-%: -f $(DOCKERFILE) \ $(CURDIR) +# 
Build only the application stage (final runtime image) +# This is useful for building production-ready images without development tools +.PHONY: application +application: $(APPLICATION_TARGETS) +$(APPLICATION_TARGETS): application-%: + $(DOCKER) build $(DOCKER_PULL_FLAG) \ + --provenance=false --sbom=false \ + --target application \ + $(DOCKER_BUILD_OPTIONS) \ + $(DOCKER_BUILD_PLATFORM_OPTIONS) \ + --tag $(IMAGE) \ + --build-arg VERSION="$(VERSION)" \ + --build-arg GIT_COMMIT="$(GIT_COMMIT)" \ + --build-arg GOPROXY="$(GOPROXY)" \ + $(if $(LABEL_IMAGE_SOURCE),--label "org.opencontainers.image.source=$(LABEL_IMAGE_SOURCE)",) \ + -f $(DOCKERFILE) \ + $(CURDIR) + # Handle the default build target. .PHONY: build build: $(DEFAULT_PUSH_TARGET) @@ -91,6 +121,13 @@ $(PUSH_TARGETS): push-%: image copy \ $(IMAGE) $(OUT_IMAGE) +# Push the application stage image +.PHONY: push-application +push-application: application + $(REGCTL) \ + image copy \ + $(IMAGE) $(OUT_IMAGE) + push-short: $(REGCTL) \ image copy \ diff --git a/deployments/helm/nvidia-device-plugin/values.yaml b/deployments/helm/nvidia-device-plugin/values.yaml index a02688cfa..44d8fb960 100644 --- a/deployments/helm/nvidia-device-plugin/values.yaml +++ b/deployments/helm/nvidia-device-plugin/values.yaml @@ -32,6 +32,7 @@ migStrategy: null failOnInitError: null deviceListStrategy: null deviceIDStrategy: null +resourceNamePrefix: null nvidiaDriverRoot: null gdrcopyEnabled: null gdsEnabled: null diff --git a/internal/lm/mig-strategy.go b/internal/lm/mig-strategy.go index f25ef91b2..59e0dc12a 100644 --- a/internal/lm/mig-strategy.go +++ b/internal/lm/mig-strategy.go @@ -103,7 +103,7 @@ func newMigLabeler(manager resource.Manager, config *spec.Config) (Labeler, erro } labelers := Merge( - migStrategyLabeler(*config.Flags.MigStrategy), + migStrategyLabeler(config, *config.Flags.MigStrategy), labeler, ) @@ -157,7 +157,7 @@ func newGPULabelers(manager resource.Manager, config *spec.Config) (Labeler, err // These do not 
include sharing information. for name, migEnabledDevice := range migEnabledDevices { // We generate a resource label with no sharing modifications - l, err := NewGPUResourceLabelerWithoutSharing(migEnabledDevice, counts[name]) + l, err := NewGPUResourceLabelerWithoutSharing(config, migEnabledDevice, counts[name]) if err != nil { return nil, fmt.Errorf("failed to construct labeler: %v", err) } @@ -196,7 +196,7 @@ func newMigStrategySingleLabeler(manager resource.Manager, config *spec.Config) } // If any migEnabled=true device is empty, we return the set of mig-strategy-invalid labels. if hasEmpty { - return newInvalidMigStrategyLabeler(migEnabledDevices[0], "at least one MIG device is enabled but empty") + return newInvalidMigStrategyLabeler(config, migEnabledDevices[0], "at least one MIG device is enabled but empty") } migDisabledDevices, err := deviceInfo.GetDevicesWithMigDisabled() @@ -205,7 +205,7 @@ func newMigStrategySingleLabeler(manager resource.Manager, config *spec.Config) } // If we have a mix of mig-enabled and mig-disabled device we return the set of mig-strategy-invalid labels if len(migDisabledDevices) != 0 { - return newInvalidMigStrategyLabeler(migEnabledDevices[0], "devices with MIG enabled and disable detected") + return newInvalidMigStrategyLabeler(config, migEnabledDevices[0], "devices with MIG enabled and disable detected") } migs, err := deviceInfo.GetAllMigDevices() @@ -225,7 +225,7 @@ func newMigStrategySingleLabeler(manager resource.Manager, config *spec.Config) // For the first occurrence we update the device reference and the resource name if !exists { resource.device = mig - resource.name = fullGPUResourceName + resource.name = spec.ResourceName(config.GetResourceNamePrefix() + "/gpu") } // We increase the count resource.count++ @@ -235,13 +235,13 @@ func newMigStrategySingleLabeler(manager resource.Manager, config *spec.Config) // Multiple resources mean that we have more than one MIG profile defined. 
Return the set of mig-strategy-invalid labels. if len(resources) != 1 { - return newInvalidMigStrategyLabeler(migEnabledDevices[0], "more than one MIG device type present on node") + return newInvalidMigStrategyLabeler(config, migEnabledDevices[0], "more than one MIG device type present on node") } return newMIGDeviceLabelers(resources, config) } -func newInvalidMigStrategyLabeler(device resource.Device, reason string) (Labeler, error) { +func newInvalidMigStrategyLabeler(config *spec.Config, device resource.Device, reason string) (Labeler, error) { klog.Warningf("Invalid configuration detected for mig-strategy=single: %v", reason) model, err := device.GetName() @@ -249,8 +249,9 @@ func newInvalidMigStrategyLabeler(device resource.Device, reason string) (Labele return nil, fmt.Errorf("failed to get device model: %v", err) } + prefix := config.GetResourceNamePrefix() rl := resourceLabeler{ - resourceName: "nvidia.com/gpu", + resourceName: spec.ResourceName(prefix + "/gpu"), } labels := rl.productLabel(model, "MIG", "INVALID") @@ -285,7 +286,7 @@ func newMigStrategyMixedLabeler(manager resource.Manager, config *spec.Config) ( // For the first occurrence we update the device reference and the resource name if !exists { resource.device = mig - resource.name = spec.ResourceName("nvidia.com/mig-" + name) + resource.name = spec.ResourceName(config.GetResourceNamePrefix() + "/mig-" + name) } // We increase the count resource.count++ diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go index 4d3e00496..96b443bc8 100644 --- a/internal/lm/nvml.go +++ b/internal/lm/nvml.go @@ -60,7 +60,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e return nil, fmt.Errorf("failed to construct version labeler: %v", err) } - migCapabilityLabeler, err := newMigCapabilityLabeler(manager) + migCapabilityLabeler, err := newMigCapabilityLabeler(config, manager) if err != nil { return nil, fmt.Errorf("error creating mig capability labeler: %v", err) } @@ -144,7 
+144,7 @@ func newVersionLabeler(manager resource.Manager) (Labeler, error) { // newMigCapabilityLabeler creates a new MIG capability labeler using the provided NVML library. // If any GPU on the node is mig-capable the label is set to true. -func newMigCapabilityLabeler(manager resource.Manager) (Labeler, error) { +func newMigCapabilityLabeler(config *spec.Config, manager resource.Manager) (Labeler, error) { isMigCapable := false devices, err := manager.GetDevices() @@ -167,16 +167,22 @@ func newMigCapabilityLabeler(manager resource.Manager) (Labeler, error) { } } + prefix := config.GetResourceNamePrefix() labels := Labels{ - "nvidia.com/mig.capable": strconv.FormatBool(isMigCapable), + prefix + "/mig.capable": strconv.FormatBool(isMigCapable), } return labels, nil } func newSharingLabeler(manager resource.Manager, config *spec.Config) (Labeler, error) { + prefix := spec.DefaultResourceNamePrefix + if config != nil { + prefix = config.GetResourceNamePrefix() + } + if config == nil || config.Sharing.SharingStrategy() != spec.SharingStrategyMPS { labels := Labels{ - "nvidia.com/mps.capable": "false", + prefix + "/mps.capable": "false", } return labels, nil } @@ -187,7 +193,7 @@ func newSharingLabeler(manager resource.Manager, config *spec.Config) (Labeler, } labels := Labels{ - "nvidia.com/mps.capable": strconv.FormatBool(capable), + prefix + "/mps.capable": strconv.FormatBool(capable), } return labels, nil } diff --git a/internal/lm/nvml_test.go b/internal/lm/nvml_test.go index 073721cd8..ceca980ec 100644 --- a/internal/lm/nvml_test.go +++ b/internal/lm/nvml_test.go @@ -63,8 +63,9 @@ func TestMigCapabilityLabeler(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { nvmlMock := rt.NewManagerMockWithDevices(tc.devices...) 
+ config := &spec.Config{} - migCapabilityLabeler, _ := newMigCapabilityLabeler(nvmlMock) + migCapabilityLabeler, _ := newMigCapabilityLabeler(config, nvmlMock) labels, err := migCapabilityLabeler.Labels() if tc.expectedError { diff --git a/internal/lm/resource.go b/internal/lm/resource.go index f85f38f20..5c240cb9d 100644 --- a/internal/lm/resource.go +++ b/internal/lm/resource.go @@ -27,12 +27,43 @@ import ( "github.com/NVIDIA/k8s-device-plugin/internal/resource" ) -const fullGPUResourceName = "nvidia.com/gpu" - // NewGPUResourceLabelerWithoutSharing creates a resource labeler for the specified device that does not apply sharing labels. -func NewGPUResourceLabelerWithoutSharing(device resource.Device, count int) (Labeler, error) { - // NOTE: We use a nil config to signal that sharing is disabled. - return NewGPUResourceLabeler(nil, device, count) +func NewGPUResourceLabelerWithoutSharing(config *spec.Config, device resource.Device, count int) (Labeler, error) { + if count == 0 { + return empty{}, nil + } + + model, err := device.GetName() + if err != nil { + return nil, fmt.Errorf("failed to get device model: %v", err) + } + + totalMemoryMiB, err := device.GetTotalMemoryMiB() + if err != nil { + klog.Warningf("Ignoring error getting memory info for device: %v", err) + } + + // Get the resource name from config, but pass nil config to newResourceLabeler to disable sharing + resourceName := spec.ResourceName(config.GetResourceNamePrefix() + "/gpu") + resourceLabeler := newResourceLabeler(resourceName, nil) + + architectureLabels, err := newArchitectureLabels(resourceLabeler, device) + if err != nil { + return nil, fmt.Errorf("failed to create architecture labels: %v", err) + } + + memoryLabeler := (Labeler)(&empty{}) + if totalMemoryMiB != 0 { + memoryLabeler = resourceLabeler.single("memory", totalMemoryMiB) + } + + labelers := Merge( + resourceLabeler.baseLabeler(count, model), + memoryLabeler, + architectureLabels, + ) + + return labelers, nil } // 
NewGPUResourceLabeler creates a resource labeler for the specified full GPU device with the specified count @@ -51,7 +82,9 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in klog.Warningf("Ignoring error getting memory info for device: %v", err) } - resourceLabeler := newResourceLabeler(fullGPUResourceName, config) + // Get the resource name from config + resourceName := spec.ResourceName(config.GetResourceNamePrefix() + "/gpu") + resourceLabeler := newResourceLabeler(resourceName, config) architectureLabels, err := newArchitectureLabels(resourceLabeler, device) if err != nil { diff --git a/internal/lm/strategy.go b/internal/lm/strategy.go index 170adc336..de4f7002e 100644 --- a/internal/lm/strategy.go +++ b/internal/lm/strategy.go @@ -16,13 +16,16 @@ package lm +import spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + // migStrategyLabeler creates a labler for setting the mig strategy label -func migStrategyLabeler(strategy string) Labeler { +func migStrategyLabeler(config *spec.Config, strategy string) Labeler { if strategy == MigStrategyNone { return empty{} } + prefix := config.GetResourceNamePrefix() return Labels{ - "nvidia.com/mig.strategy": strategy, + prefix + "/mig.strategy": strategy, } } diff --git a/internal/rm/rm.go b/internal/rm/rm.go index 33f44b9d8..c172d276b 100644 --- a/internal/rm/rm.go +++ b/internal/rm/rm.go @@ -97,13 +97,23 @@ func (r *resourceManager) ValidateRequest(ids AnnotatedIDs) error { // AddDefaultResourcesToConfig adds default resource matching rules to config.Resources func AddDefaultResourcesToConfig(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interface, config *spec.Config) error { - _ = config.Resources.AddGPUResource("*", "gpu") + klog.Infof("DEBUG: AddDefaultResourcesToConfig called, config.Resources pointer: %p", &config.Resources) + prefix := config.GetResourceNamePrefix() + gpuResourceName := prefix + "/gpu" + klog.Infof("Adding default GPU resource: 
pattern='*', name='%s'", gpuResourceName) + err := config.Resources.AddGPUResource("*", gpuResourceName) + if err != nil { + return fmt.Errorf("error adding GPU resource: %w", err) + } + klog.Infof("Successfully added GPU resource. Total GPUs in config: %d", len(config.Resources.GPUs)) if config.Flags.MigStrategy == nil { + klog.Infof("MigStrategy is nil, returning early") return nil } + klog.Infof("MigStrategy is: %s", *config.Flags.MigStrategy) switch *config.Flags.MigStrategy { case spec.MigStrategySingle: - return config.Resources.AddMIGResource("*", "gpu") + return config.Resources.AddMIGResource("*", gpuResourceName) case spec.MigStrategyMixed: hasNVML, reason := infolib.HasNvml() if !hasNVML { @@ -132,7 +142,8 @@ func AddDefaultResourcesToConfig(infolib info.Interface, nvmllib nvml.Interface, return nil } resourceName := strings.ReplaceAll("mig-"+p.String(), "+", ".") - return config.Resources.AddMIGResource(p.String(), resourceName) + migResourceName := prefix + "/" + resourceName + return config.Resources.AddMIGResource(p.String(), migResourceName) }) } return nil From 272b86c04906b5c5467e5fde1debeb4c087abc28 Mon Sep 17 00:00:00 2001 From: hawkli-1994 <11769524+hawkli-1994@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:37:22 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20.gitignore=20=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8d61cf32b..07a382d22 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ CLAUDE.md custom-nvidia-device-plugin.yaml build-application.sh push-application.sh +docs/nvidia-device-plugin-driver-interaction.md