Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,11 @@ bin
*.out
*.log
ginkgo.json
docs/custom-resource-name-tasks.md
docs/custom-resource-name-prefix.md
docs/custom-resource-name-prefix_zh.md
CLAUDE.md
custom-nvidia-device-plugin.yaml
build-application.sh
push-application.sh
docs/nvidia-device-plugin-driver-interaction.md
75 changes: 75 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,47 @@ Test PASSED
Done
```

#### Using Custom Resource Name Prefix

If you have configured the device plugin with a custom resource name prefix, you can request GPUs using that prefix:

```shell
# First, deploy the device plugin with a custom prefix
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-plugin-config
  namespace: nvidia-device-plugin
data:
  config.yaml: |
    version: v1
    flags:
      resourceNamePrefix: "custom.domain"
EOF
```

```shell
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod-custom
spec:
  restartPolicy: Never
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
      resources:
        limits:
          custom.domain/gpu: 1 # requesting 1 GPU with custom prefix
  tolerations:
  - key: nvidia.com/gpu
    operator: Exists
    effect: NoSchedule
EOF
```

> [!WARNING]
> If you do not request GPUs when you use the device plugin, the plugin exposes all the GPUs on the machine inside your container.

Expand All @@ -221,6 +262,7 @@ deploying the plugin via `helm`.
| `--pass-device-specs` | `$PASS_DEVICE_SPECS` | `false` |
| `--device-list-strategy` | `$DEVICE_LIST_STRATEGY` | `"envvar"` |
| `--device-id-strategy` | `$DEVICE_ID_STRATEGY` | `"uuid"` |
| `--resource-name-prefix` | `$RESOURCE_NAME_PREFIX` | `"nvidia.com"` |
| `--config-file` | `$CONFIG_FILE` | `""` |

### As a configuration file
Expand All @@ -231,6 +273,7 @@ flags:
migStrategy: "none"
failOnInitError: true
nvidiaDriverRoot: "/"
resourceNamePrefix: "nvidia.com"
plugin:
passDeviceSpecs: false
deviceListStrategy: "envvar"
Expand Down Expand Up @@ -352,6 +395,38 @@ options outside of this section are shared.
launch time. As described below, a `ConfigMap` can be used to point the
plugin at a desired configuration file when deploying via `helm`.

**`RESOURCE_NAME_PREFIX`**:
customize the prefix for GPU resource names and labels

`(default 'nvidia.com')`

The `RESOURCE_NAME_PREFIX` option allows you to customize the domain prefix
for all GPU-related resources and labels. By default, resources are exposed as
`nvidia.com/gpu`, but you can change this to any custom domain (e.g.,
`custom.domain/gpu`).

This affects:
- GPU resource names in pod specs: `{prefix}/gpu`
- MIG resource names: `{prefix}/mig-*`
- All node labels generated by gpu-feature-discovery: `{prefix}/*`

**Example**:
```yaml
# Custom resource name prefix
flags:
  resourceNamePrefix: "custom.domain"

# Resources will be advertised as: custom.domain/gpu
# Labels will be: custom.domain/gpu.count, custom.domain/gpu.product, etc.
```

**Important Notes**:
- The prefix must not contain '/'
- The prefix must be 253 characters or fewer
- This prefix must match across both the device plugin and gpu-feature-discovery
- Changing this prefix requires updating all pod specs to request resources using the new prefix
- This is useful for multi-tenant clusters or environments requiring custom resource naming

### Shared Access to GPUs

The NVIDIA device plugin allows oversubscription of GPUs through a set of
Expand Down
10 changes: 10 additions & 0 deletions api/config/v1/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ type Config struct {
Imex Imex `json:"imex,omitempty" yaml:"imex,omitempty"`
}

// GetResourceNamePrefix reports the resource name prefix in effect for c.
// The value of Flags.ResourceNamePrefix is used when it is set to a non-empty
// string; otherwise DefaultResourceNamePrefix is returned.
func (c *Config) GetResourceNamePrefix() string {
	configured := c.Flags.ResourceNamePrefix
	// An unset pointer and an empty string both mean "use the default".
	if configured == nil || *configured == "" {
		return DefaultResourceNamePrefix
	}
	return *configured
}


// NewConfig builds out a Config struct from a config file (or command line flags).
// The data stored in the config will be populated in order of precedence from
// (1) command line, (2) environment variable, (3) config file.
Expand Down
1 change: 1 addition & 0 deletions api/config/v1/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

// Constants related to resource names
const (
DefaultResourceNamePrefix = "nvidia.com"
ResourceNamePrefix = "nvidia.com"
DefaultSharedResourceNameSuffix = ".shared"
MaxResourceNameLength = 63
Expand Down
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Flags struct {
type CommandLineFlags struct {
MigStrategy *string `json:"migStrategy" yaml:"migStrategy"`
FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"`
ResourceNamePrefix *string `json:"resourceNamePrefix,omitempty" yaml:"resourceNamePrefix,omitempty"`
MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
NvidiaDevRoot *string `json:"nvidiaDevRoot,omitempty" yaml:"nvidiaDevRoot,omitempty"`
Expand Down Expand Up @@ -121,6 +122,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.MigStrategy, c, n)
case "fail-on-init-error":
updateFromCLIFlag(&f.FailOnInitError, c, n)
case "resource-name-prefix":
updateFromCLIFlag(&f.ResourceNamePrefix, c, n)
case "mps-root":
updateFromCLIFlag(&f.MpsRoot, c, n)
case "driver-root", "nvidia-driver-root":
Expand Down
6 changes: 5 additions & 1 deletion api/config/v1/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package v1
import (
"encoding/json"
"fmt"
"os"
"regexp"
"strings"

Expand Down Expand Up @@ -46,7 +47,8 @@ type Resources struct {
// NewResourceName builds a resource name from the standard prefix and a name.
// An error is returned if the format is incorrect.
func NewResourceName(n string) (ResourceName, error) {
if !strings.HasPrefix(n, ResourceNamePrefix+"/") {
// Only add default prefix if the name doesn't already contain a '/' (i.e., no custom prefix)
if !strings.Contains(n, "/") {
n = ResourceNamePrefix + "/" + n
}

Expand All @@ -73,6 +75,8 @@ func NewResource(pattern, name string) (*Resource, error) {
Pattern: ResourcePattern(pattern),
Name: resourceName,
}
// Log to stderr (visible in k8s logs)
fmt.Fprintf(os.Stderr, "[INFO] Created resource: pattern=%s, name=%s\n", pattern, resourceName)
return r, nil
}

Expand Down
16 changes: 16 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ func main() {
Usage: "fail the plugin if an error is encountered during initialization, otherwise block indefinitely",
EnvVars: []string{"GFD_FAIL_ON_INIT_ERROR", "FAIL_ON_INIT_ERROR"},
},
&cli.StringFlag{
Name: "resource-name-prefix",
Value: "nvidia.com",
Usage: "the prefix to use for resource names (e.g., 'nvidia.com' for nvidia.com/gpu)",
EnvVars: []string{"GFD_RESOURCE_NAME_PREFIX", "RESOURCE_NAME_PREFIX"},
},
&cli.BoolFlag{
Name: "oneshot",
Value: false,
Expand Down Expand Up @@ -138,6 +144,16 @@ func validateFlags(config *spec.Config) error {
default:
return fmt.Errorf("invalid --device-discovery-strategy option %v", *config.Flags.DeviceDiscoveryStrategy)
}

// Validate resource name prefix format
if config.Flags.ResourceNamePrefix != nil && *config.Flags.ResourceNamePrefix != "" {
prefix := *config.Flags.ResourceNamePrefix
if prefix != "nvidia.com" {
klog.Warningf("Using custom resource name prefix: %s (default is nvidia.com)", prefix)
klog.Warning("All pods requesting GPU resources must be updated to use the new resource name format")
}
}

return nil
}

Expand Down
71 changes: 71 additions & 0 deletions cmd/gpu-feature-discovery/mig_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,74 @@ func TestMigStrategyMixed(t *testing.T) {
require.Contains(t, labels, "nvidia.com/mig-3g.20gb.count", "Missing label")
require.Contains(t, labels, "nvidia.com/mig-1g.5gb.count", "Missing label")
}

// TestMigStrategySingleWithCustomPrefix runs GFD once with the "single" MIG
// strategy and a "custom.domain" resource name prefix, then checks that the
// labels written to the output file use the custom prefix and that the
// corresponding default "nvidia.com" labels are absent.
func TestMigStrategySingleWithCustomPrefix(t *testing.T) {
	// VGPU mock library (per the original note: with empty vgpu devices).
	vgpuMock := NewTestVGPUMock()

	// A single MIG-enabled device carrying two identically configured MIG devices.
	gpus := []resource.Device{
		rt.NewMigEnabledDevice(
			rt.NewMigDevice(3, 0, 20),
			rt.NewMigDevice(3, 0, 20),
		),
	}
	nvmlMock := rt.NewManagerMockWithDevices(gpus...)

	gfdFlags := &spec.GFDCommandLineFlags{
		Oneshot:         ptr(true),
		OutputFile:      ptr("./gfd-test-mig-single-custom"),
		SleepInterval:   ptr(spec.Duration(time.Second)),
		NoTimestamp:     ptr(false),
		MachineTypeFile: ptr(testMachineTypeFile),
	}
	conf := &spec.Config{
		Flags: spec.Flags{
			CommandLineFlags: spec.CommandLineFlags{
				MigStrategy:        ptr("single"),
				ResourceNamePrefix: ptr("custom.domain"),
				FailOnInitError:    ptr(true),
				GFD:                gfdFlags,
			},
		},
	}

	setupMachineFile(t)
	defer removeMachineFile(t)

	labelOutputer, err := lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{})
	require.NoError(t, err)

	discovery := gfd{
		manager:       nvmlMock,
		vgpu:          vgpuMock,
		config:        conf,
		labelOutputer: labelOutputer,
	}
	restart, err := discovery.run(nil)
	require.NoError(t, err, "Error from run function")
	require.False(t, restart)

	labelFile, err := os.Open(*conf.Flags.GFD.OutputFile)
	require.NoError(t, err, "Opening output file")

	// Close and delete the generated label file once the assertions are done.
	defer func() {
		closeErr := labelFile.Close()
		require.NoError(t, closeErr, "Closing output file")
		removeErr := os.Remove(*conf.Flags.GFD.OutputFile)
		require.NoError(t, removeErr, "Removing output file")
	}()

	raw, err := io.ReadAll(labelFile)
	require.NoError(t, err, "Reading output file")

	labels, err := buildLabelMapFromOutput(raw)
	require.NoError(t, err, "Building map of labels from output file")

	// Labels that must be emitted under the custom prefix, checked in order.
	customLabels := []struct {
		key   string
		value string
	}{
		{"custom.domain/mig.strategy", "single"},
		{"custom.domain/gpu.count", "2"},
		{"custom.domain/gpu.product", "MOCKMODEL-MIG-3g.20gb"},
		{"custom.domain/gpu.memory", "20"},
	}
	for _, want := range customLabels {
		require.Equal(t, labels[want.key], want.value, "Incorrect label")
	}

	// The same labels must not also appear under the default prefix.
	for _, key := range []string{"nvidia.com/mig.strategy", "nvidia.com/gpu.count"} {
		require.NotContains(t, labels, key, "Default prefix should not be present")
	}
}
17 changes: 17 additions & 0 deletions cmd/nvidia-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ func main() {
Usage: "fail the plugin if an error is encountered during initialization, otherwise block indefinitely",
EnvVars: []string{"FAIL_ON_INIT_ERROR"},
},
&cli.StringFlag{
Name: "resource-name-prefix",
Value: "nvidia.com",
Usage: "the prefix to use for resource names (e.g., 'nvidia.com' for nvidia.com/gpu)",
EnvVars: []string{"RESOURCE_NAME_PREFIX"},
},
&cli.StringFlag{
Name: "driver-root",
Aliases: []string{"nvidia-driver-root"},
Expand Down Expand Up @@ -228,6 +234,17 @@ func validateFlags(infolib nvinfo.Interface, config *spec.Config) error {
return fmt.Errorf("invalid IMEX channel IDs: %w", err)
}

// Validate resource name prefix format
if config.Flags.ResourceNamePrefix != nil && *config.Flags.ResourceNamePrefix != "" {
prefix := *config.Flags.ResourceNamePrefix
if prefix == "nvidia.com" {
// This is the default, no special validation needed
return nil
}
klog.Warningf("Using custom resource name prefix: %s (default is nvidia.com)", prefix)
klog.Warning("All pods requesting GPU resources must be updated to use the new resource name format")
}

return nil
}

Expand Down
2 changes: 1 addition & 1 deletion deployments/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ FROM base AS devel
WORKDIR /work
COPY * .

ARG GOPROXY="https://proxy.golang.org,direct"
ARG GOPROXY="https://goproxy.cn,direct"
ENV GOPROXY=$GOPROXY

RUN make install-tools
Expand Down
Loading