diff --git a/docs/docs/howto/cluster-setup/bare-metal.md b/docs/docs/howto/cluster-setup/bare-metal.md
index 33a75fc3c9..f6663db364 100644
--- a/docs/docs/howto/cluster-setup/bare-metal.md
+++ b/docs/docs/howto/cluster-setup/bare-metal.md
@@ -205,6 +205,8 @@ helm install --wait --generate-name \
   --set sandboxDevicePlugin.repository=ghcr.io/nvidia \
   --set sandboxDevicePlugin.image=nvidia-sandbox-device-plugin \
   --set sandboxDevicePlugin.version=8e76fe81 \
+  --set 'sandboxDevicePlugin.env[0].name=P_GPU_ALIAS' \
+  --set 'sandboxDevicePlugin.env[0].value=pgpu' \
   --set nfd.enabled=true \
   --set nfd.nodefeaturerules=true
 ```
@@ -296,7 +298,7 @@ The above command should yield an output similar to the following, depending on
 
 ```json
 {
-  "nvidia.com/GH100_H100_PCIE": "1"
+  "nvidia.com/pgpu": "1"
 }
 ```
 
diff --git a/docs/docs/howto/workload-deployment/GPU-configuration.md b/docs/docs/howto/workload-deployment/GPU-configuration.md
index 9f871d33f2..c2a08fcba9 100644
--- a/docs/docs/howto/workload-deployment/GPU-configuration.md
+++ b/docs/docs/howto/workload-deployment/GPU-configuration.md
@@ -63,7 +63,7 @@ spec:
     - # ...
       resources:
        limits:
-         "nvidia.com/GH100_H100_PCIE": 1
+         "nvidia.com/pgpu": 1
      env:
        # ...
        - name: NVIDIA_VISIBLE_DEVICES
diff --git a/e2e/gpu/gpu_test.go b/e2e/gpu/gpu_test.go
index 80a6cb610b..9c28f3ba5e 100644
--- a/e2e/gpu/gpu_test.go
+++ b/e2e/gpu/gpu_test.go
@@ -42,19 +42,17 @@ func TestGPU(t *testing.T) {
 	runtimeHandler, err := manifest.RuntimeHandler(platform)
 	require.NoError(t, err)
 
-	var deviceURI, gpuName string
+	var gpuName string
 	switch platform {
 	case platforms.MetalQEMUTDXGPU:
-		deviceURI = "nvidia.com/GB100_B200"
 		gpuName = "NVIDIA B200"
 	case platforms.MetalQEMUSNPGPU:
-		deviceURI = "nvidia.com/GH100_H100_PCIE"
 		gpuName = "NVIDIA H100 PCIe"
 	default:
 		t.Errorf("platform %s does not support GPU tests", platform)
 	}
 
-	resources := kuberesource.GPU(deviceURI)
+	resources := kuberesource.GPU()
 
 	// Since the TDX-GPU testing cluster has multiple GPUs, we run into the drift
 	// explained in [1]. To avoid this, we need to remove the deployment of the direct GPU tester,
diff --git a/internal/kuberesource/resourcegen/main.go b/internal/kuberesource/resourcegen/main.go
index 971b14f642..475296d991 100644
--- a/internal/kuberesource/resourcegen/main.go
+++ b/internal/kuberesource/resourcegen/main.go
@@ -68,7 +68,7 @@ func main() {
 	case "vault":
 		subResources = kuberesource.PatchRuntimeHandlers(kuberesource.Vault(*namespace), "contrast-cc")
 	case "gpu":
-		subResources = kuberesource.PatchRuntimeHandlers(kuberesource.GPU("placeholder-gpu"), "contrast-cc")
+		subResources = kuberesource.PatchRuntimeHandlers(kuberesource.GPU(), "contrast-cc")
 	default:
 		log.Fatalf("Error: unknown set: %s\n", set)
 	}
diff --git a/internal/kuberesource/sets.go b/internal/kuberesource/sets.go
index e65d44a173..edb7f963de 100644
--- a/internal/kuberesource/sets.go
+++ b/internal/kuberesource/sets.go
@@ -658,7 +658,7 @@ done
 }
 
 // GPU returns the resources for deploying a GPU test pod.
-func GPU(deviceURI string) []any {
+func GPU() []any {
 	tester := Deployment("gpu-tester", "").
 		WithSpec(DeploymentSpec().
 			WithReplicas(1).
@@ -679,7 +679,7 @@
 						WithResources(ResourceRequirements().
 							WithMemoryLimitAndRequest(500). // This accounts for nvidia-smi and the guest pull overhead.
 							WithLimits(corev1.ResourceList{
-								corev1.ResourceName(deviceURI): resource.MustParse("1"),
+								corev1.ResourceName("nvidia.com/pgpu"): resource.MustParse("1"),
 							}),
 						),
 					Container().
diff --git a/packages/upgrade-gpu-operator.sh b/packages/upgrade-gpu-operator.sh
index df1408a806..155fd013cb 100644
--- a/packages/upgrade-gpu-operator.sh
+++ b/packages/upgrade-gpu-operator.sh
@@ -50,9 +50,6 @@ kubectl delete crd nvidiadrivers.nvidia.com --ignore-not-found
 helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
 
 # Upstream instructions from https://github.com/kata-containers/kata-containers/pull/12257
-# The P_GPU_ALIAS environment variable is stripped to get per-GPU annotations for heterogenous
-# clusters. With it, all GPUs will look like this to CDI clients: `nvidia.com/pgpu`. Without
-# it, the annotations are specific to the GPU type: `nvidia.com/GB100_B200`.
 helm install --wait --generate-name \
   -n gpu-operator --create-namespace \
   nvidia/gpu-operator \
@@ -72,6 +69,8 @@ helm install --wait --generate-name \
   --set sandboxDevicePlugin.repository=ghcr.io/nvidia \
   --set sandboxDevicePlugin.image=nvidia-sandbox-device-plugin \
   --set sandboxDevicePlugin.version=8e76fe81 \
+  --set 'sandboxDevicePlugin.env[0].name=P_GPU_ALIAS' \
+  --set 'sandboxDevicePlugin.env[0].value=pgpu' \
   --set nfd.enabled=true \
   --set nfd.nodefeaturerules=true
 
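
Reviewer note, not part of the change set: once the sandbox device plugin comes back up with `P_GPU_ALIAS=pgpu`, GPU nodes should advertise the generic `nvidia.com/pgpu` extended resource instead of a model-specific name such as `nvidia.com/GH100_H100_PCIE`. A quick way to check this on a live cluster is sketched below; it assumes `kubectl` access to the cluster and `jq` on the path.

```sh
# Hedged sketch: list the nvidia.com/* extended resources each node advertises.
# Nodes covered by the new chart values should report "nvidia.com/pgpu",
# matching the expected output documented in bare-metal.md above.
kubectl get nodes -o json | jq '.items[]
  | {node: .metadata.name,
     gpus: (.status.allocatable | with_entries(select(.key | startswith("nvidia.com/"))))}'
```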