Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/docs/howto/cluster-setup/bare-metal.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ helm install --wait --generate-name \
--set sandboxDevicePlugin.repository=ghcr.io/nvidia \
--set sandboxDevicePlugin.image=nvidia-sandbox-device-plugin \
--set sandboxDevicePlugin.version=8e76fe81 \
--set 'sandboxDevicePlugin.env[0].name=P_GPU_ALIAS' \
--set 'sandboxDevicePlugin.env[0].value=pgpu' \
--set nfd.enabled=true \
--set nfd.nodefeaturerules=true
```
Expand Down Expand Up @@ -296,7 +298,7 @@ The above command should yield an output similar to the following, depending on

```json
{
"nvidia.com/GH100_H100_PCIE": "1"
"nvidia.com/pgpu": "1"
}
```

Expand Down
2 changes: 1 addition & 1 deletion docs/docs/howto/workload-deployment/GPU-configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ spec:
- # ...
resources:
limits:
"nvidia.com/GH100_H100_PCIE": 1
"nvidia.com/pgpu": 1
env:
# ...
- name: NVIDIA_VISIBLE_DEVICES
Expand Down
6 changes: 2 additions & 4 deletions e2e/gpu/gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,17 @@ func TestGPU(t *testing.T) {
runtimeHandler, err := manifest.RuntimeHandler(platform)
require.NoError(t, err)

var deviceURI, gpuName string
var gpuName string
switch platform {
case platforms.MetalQEMUTDXGPU:
deviceURI = "nvidia.com/GB100_B200"
gpuName = "NVIDIA B200"
case platforms.MetalQEMUSNPGPU:
deviceURI = "nvidia.com/GH100_H100_PCIE"
gpuName = "NVIDIA H100 PCIe"
default:
t.Errorf("platform %s does not support GPU tests", platform)
}

resources := kuberesource.GPU(deviceURI)
resources := kuberesource.GPU()

// Since the TDX-GPU testing cluster has multiple GPUs, we run into the drift
// explained in [1]. To avoid this, we need to remove the deployment of the direct GPU tester,
Expand Down
2 changes: 1 addition & 1 deletion internal/kuberesource/resourcegen/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func main() {
case "vault":
subResources = kuberesource.PatchRuntimeHandlers(kuberesource.Vault(*namespace), "contrast-cc")
case "gpu":
subResources = kuberesource.PatchRuntimeHandlers(kuberesource.GPU("placeholder-gpu"), "contrast-cc")
subResources = kuberesource.PatchRuntimeHandlers(kuberesource.GPU(), "contrast-cc")
default:
log.Fatalf("Error: unknown set: %s\n", set)
}
Expand Down
4 changes: 2 additions & 2 deletions internal/kuberesource/sets.go
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,7 @@ done
}

// GPU returns the resources for deploying a GPU test pod.
func GPU(deviceURI string) []any {
func GPU() []any {
tester := Deployment("gpu-tester", "").
WithSpec(DeploymentSpec().
WithReplicas(1).
Expand All @@ -679,7 +679,7 @@ func GPU(deviceURI string) []any {
WithResources(ResourceRequirements().
WithMemoryLimitAndRequest(500). // This accounts for nvidia-smi and the guest pull overhead.
WithLimits(corev1.ResourceList{
corev1.ResourceName(deviceURI): resource.MustParse("1"),
corev1.ResourceName("nvidia.com/pgpu"): resource.MustParse("1"),
}),
),
Container().
Expand Down
5 changes: 2 additions & 3 deletions packages/upgrade-gpu-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ kubectl delete crd nvidiadrivers.nvidia.com --ignore-not-found
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update

# Upstream instructions from https://github.com/kata-containers/kata-containers/pull/12257
# The P_GPU_ALIAS environment variable is stripped to get per-GPU annotations for heterogeneous
# clusters. With it, all GPUs will look like this to CDI clients: `nvidia.com/pgpu`. Without
# it, the annotations are specific to the GPU type: `nvidia.com/GB100_B200`.
helm install --wait --generate-name \
-n gpu-operator --create-namespace \
nvidia/gpu-operator \
Expand All @@ -72,6 +69,8 @@ helm install --wait --generate-name \
--set sandboxDevicePlugin.repository=ghcr.io/nvidia \
--set sandboxDevicePlugin.image=nvidia-sandbox-device-plugin \
--set sandboxDevicePlugin.version=8e76fe81 \
--set 'sandboxDevicePlugin.env[0].name=P_GPU_ALIAS' \
--set 'sandboxDevicePlugin.env[0].value=pgpu' \
--set nfd.enabled=true \
--set nfd.nodefeaturerules=true

Expand Down