Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-dev-char-symlinks"
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert/yaml"
cli "github.com/urfave/cli/v3"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down Expand Up @@ -67,6 +68,9 @@ type NvidiaFs struct{}
// GDRCopy represents the GDRCopy driver component.
type GDRCopy struct{}

// NvidiaPeermem represents the nvidia-peermem driver component. Its validate
// method checks that the nvidia_peermem kernel module is loaded and records
// readiness in a status file.
type NvidiaPeermem struct{}

// CUDA represents spec to run cuda workload
type CUDA struct {
ctx context.Context
Expand Down Expand Up @@ -150,12 +154,16 @@ const (
defaultDriverInstallDir = "/run/nvidia/driver"
// defaultDriverInstallDirCtrPath indicates the default path where the NVIDIA driver install dir is mounted in the container
defaultDriverInstallDirCtrPath = "/run/nvidia/driver"
// additionalDriversFlagsFilePath indicates the path to the file which contains additional drivers status flags
additionalDriversFlagsFilePath = defaultDriverInstallDirCtrPath + "/.additional-drivers-flags"
// driverStatusFile indicates status file for containerized driver readiness
driverStatusFile = "driver-ready"
// nvidiaFsStatusFile indicates status file for nvidia-fs driver readiness
nvidiaFsStatusFile = "nvidia-fs-ready"
// gdrCopyStatusFile indicates status file for GDRCopy driver (gdrdrv) readiness
gdrCopyStatusFile = "gdrcopy-ready"
// nvidiaPeermemStatusFile indicates status file for nvidia-peermem driver readiness
nvidiaPeermemStatusFile = "nvidia-peermem-ready"
// toolkitStatusFile indicates status file for toolkit readiness
toolkitStatusFile = "toolkit-ready"
// pluginStatusFile indicates status file for plugin readiness
Expand Down Expand Up @@ -445,6 +453,8 @@ func isValidComponent() bool {
case "nvidia-fs":
fallthrough
case "gdrcopy":
fallthrough
case "nvidia-peermem":
return true
default:
return false
Expand Down Expand Up @@ -509,6 +519,10 @@ func start(ctx context.Context, cli *cli.Command) error {
return err
}

return validateComponent(ctx, componentFlag)
}

func validateComponent(ctx context.Context, componentFlag string) error {
switch componentFlag {
case "driver":
driver := &Driver{
Expand All @@ -533,6 +547,13 @@ func start(ctx context.Context, cli *cli.Command) error {
return fmt.Errorf("error validating gdrcopy driver installation: %w", err)
}
return nil
case "nvidia-peermem":
nvidiaPeermem := &NvidiaPeermem{}
err := nvidiaPeermem.validate()
if err != nil {
return fmt.Errorf("error validating nvidia-peermem driver installation: %w", err)
}
return nil
case "toolkit":
toolkit := &Toolkit{}
err := toolkit.validate()
Expand Down Expand Up @@ -795,9 +816,53 @@ func (d *Driver) runValidation(silent bool) (driverInfo, error) {
if err != nil {
return driverInfo{}, err
}

err = validateAdditionalDriverComponents(d.ctx)
if err != nil {
return driverInfo{}, err
}

return getDriverInfo(false, hostRootFlag, driverInstallDirFlag, driverInstallDirCtrPathFlag), nil
}

// validateAdditionalDriverComponents validates every optional driver component
// that the additional-drivers flags file marks as enabled.
//
// The file at additionalDriversFlagsFilePath is decoded as a YAML mapping of
// feature flag name to boolean. Flags set to false, and enabled flags that have
// no known component, are logged and skipped. Every remaining flag has its
// component validated through validateComponent. Components are validated in a
// fixed order so that log output and the first reported failure do not depend
// on Go's randomized map iteration order.
//
// It returns an error when the flags file cannot be read or parsed, or the
// first error returned by validateComponent.
func validateAdditionalDriverComponents(ctx context.Context) error {
	data, err := os.ReadFile(additionalDriversFlagsFilePath)
	if err != nil {
		return fmt.Errorf("error reading additional drivers flags file: %w", err)
	}

	// NOTE(review): yaml is imported from github.com/stretchr/testify/assert/yaml,
	// which is a helper package of a test library; consider switching to a
	// dedicated YAML package for production code.
	features := map[string]bool{}
	if err := yaml.Unmarshal(data, &features); err != nil {
		return fmt.Errorf("error parsing additional drivers flags file %s: %w", additionalDriversFlagsFilePath, err)
	}

	// supportedFeatures maps a feature flag to the component name understood
	// by validateComponent.
	supportedFeatures := map[string]string{
		"GDRCOPY_ENABLED":         "gdrcopy",
		"GDS_ENABLED":             "nvidia-fs",
		"GPU_DIRECT_RDMA_ENABLED": "nvidia-peermem",
	}
	// validationOrder fixes the order in which supported flags are processed.
	validationOrder := []string{
		"GDRCOPY_ENABLED",
		"GDS_ENABLED",
		"GPU_DIRECT_RDMA_ENABLED",
	}

	// Report everything that is going to be skipped; this pass only logs.
	for flag, enabled := range features {
		if !enabled {
			log.Debugf("%s is set to %t, skipping checking it", flag, enabled)
			continue
		}
		if _, ok := supportedFeatures[flag]; !ok {
			log.Infof("unsupported feature flag: %s, skipping checking it", flag)
		}
	}

	for _, flag := range validationOrder {
		// A flag that is absent from the file reads as false and is skipped.
		if !features[flag] {
			continue
		}

		component := supportedFeatures[flag]
		log.Infof("Validating additional enabled driver component: %s", component)
		if err := validateComponent(ctx, component); err != nil {
			return err
		}
	}

	return nil
}

func (d *Driver) validate() error {
// delete driver status file if already present
err := deleteStatusFile(outputDirFlag + "/" + driverStatusFile)
Expand Down Expand Up @@ -994,6 +1059,38 @@ func (g *GDRCopy) runValidation(silent bool) error {
return runCommand(command, args, silent)
}

// validate checks the nvidia-peermem driver and records the outcome in its
// status file. Any stale status file is removed first, then the module check
// runs, and the status file is created again only when that check succeeds.
// The first error encountered along the way is returned.
func (n *NvidiaPeermem) validate() error {
	statusFilePath := outputDirFlag + "/" + nvidiaPeermemStatusFile

	// clear any readiness marker left over from an earlier run
	if err := deleteStatusFile(statusFilePath); err != nil {
		return err
	}

	if err := n.runValidation(false); err != nil {
		log.Info("nvidia-peermem driver is not ready")
		return err
	}

	// the check passed, so publish the readiness marker
	return createStatusFile(statusFilePath)
}

// runValidation checks whether the nvidia_peermem kernel module appears in the
// lsmod output by running a grep pipeline through the configured shell. When
// withWaitFlag is set the command is run via runCommandWithWait using
// sleepIntervalSecondsFlag; otherwise it is run once via runCommand. The
// silent argument is passed through to the command runner.
func (n *NvidiaPeermem) runValidation(silent bool) error {
	lsmodArgs := []string{"-c", "lsmod | grep -E '^nvidia_peermem\\s'"}

	if !withWaitFlag {
		return runCommand(shell, lsmodArgs, silent)
	}
	return runCommandWithWait(shell, lsmodArgs, sleepIntervalSecondsFlag, silent)
}

func (t *Toolkit) validate() error {
// delete status file if already present
err := deleteStatusFile(outputDirFlag + "/" + toolkitStatusFile)
Expand Down
9 changes: 9 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -3435,6 +3435,15 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
setContainerProbe(driverContainer, config.Driver.ReadinessProbe, Readiness)
}

if config.GDRCopy != nil && config.GDRCopy.IsEnabled() {
// set env indicating gdrcopy is enabled
setContainerEnv(driverContainer, GDRCopyEnabledEnvName, "true")
}
if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() {
// set env indicating gds is enabled
setContainerEnv(driverContainer, GDSEnabledEnvName, "true")
}

if config.Driver.GPUDirectRDMA != nil && config.Driver.GPUDirectRDMA.IsEnabled() {
// set env indicating nvidia-peermem is enabled to compile module with required ib_* interfaces
setContainerEnv(driverContainer, GPUDirectRDMAEnabledEnvName, "true")
Expand Down
16 changes: 16 additions & 0 deletions manifests/state-driver/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,22 @@ spec:
value: "true"
{{- end }}
{{- end }}
{{- if and (.GPUDirectRDMA) (deref .GPUDirectRDMA.Enabled) }}
- name: GPU_DIRECT_RDMA_ENABLED
value: "true"
{{- if deref .GPUDirectRDMA.UseHostMOFED }}
- name: USE_HOST_MOFED
value: "true"
{{- end }}
{{- end }}
{{- if and (.GDS) (deref .GDS.Spec.Enabled) }}
- name: GDS_ENABLED
value: "true"
{{- end }}
{{- if and (.GDRCopy) (deref .GDRCopy.Spec.Enabled) }}
- name: GDRCOPY_ENABLED
value: "true"
{{- end }}
{{- if and (.Openshift) (.Runtime.OpenshiftVersion) }}
- name: OPENSHIFT_VERSION
value: {{ .Runtime.OpenshiftVersion | quote }}
Expand Down