|
| 1 | +// Unless explicitly stated otherwise all files in this repository are licensed |
| 2 | +// under the Apache License Version 2.0. |
| 3 | +// This product includes software developed at Datadog (https://www.datadoghq.com/). |
| 4 | +// Copyright 2016-present Datadog, Inc. |
| 5 | + |
| 6 | +package util |
| 7 | + |
| 8 | +import ( |
| 9 | + "regexp" |
| 10 | + "strings" |
| 11 | + |
| 12 | + "github.com/DataDog/datadog-agent/pkg/config/env" |
| 13 | +) |
| 14 | + |
// gpuUUIDRegex recognizes the identifier formats NVIDIA uses for physical GPUs
// and MIG devices. The whole input must match (the pattern is anchored at both
// ends), so partial identifiers or values with surrounding text are rejected.
//
// Accepted shapes, where <uuid> is 8-4-4-4-12 hexadecimal digits (upper or
// lower case):
//   - Physical GPU: GPU-<uuid>
//   - MIG device (modern): MIG-<uuid>
//   - MIG device (legacy): MIG-GPU-<uuid>/<gi>/<ci>, with <gi> and <ci> being
//     one or more decimal digits
//
// References:
//   - NVML API (nvmlDeviceGetUUID): https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html
//   - NVIDIA Container Toolkit: https://github.com/NVIDIA/nvidia-container-toolkit/blob/main/cmd/nvidia-container-runtime/README.md
//   - MIG User Guide: https://docs.nvidia.com/datacenter/tesla/mig-user-guide/
//
// The expression exists to tell real device UUIDs placed in
// NVIDIA_VISIBLE_DEVICES by the NVIDIA device plugin apart from user-supplied
// overrides such as "all", "none", or numeric indices.
var gpuUUIDRegex = regexp.MustCompile(`^(?:` +
	// Physical GPU.
	`GPU-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}|` +
	// Modern MIG format.
	`MIG-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}|` +
	// Legacy MIG format, suffixed with GPU-instance and compute-instance numbers.
	`MIG-GPU-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}/\d+/\d+` +
	`)$`)
| 38 | + |
| 39 | +// IsGPUUUID returns true if the value is a valid NVIDIA GPU or MIG device UUID. |
| 40 | +// This distinguishes device plugin-assigned UUIDs from user overrides like "all", "none", or indices. |
| 41 | +func IsGPUUUID(value string) bool { |
| 42 | + return gpuUUIDRegex.MatchString(value) |
| 43 | +} |
| 44 | + |
| 45 | +// areAllValidGPUUUIDs returns true if all values in the slice are valid GPU UUIDs. |
| 46 | +// Returns false if the slice is empty or contains any non-UUID value (e.g., "all", "none", "0"). |
| 47 | +// |
| 48 | +// This is used to detect user overrides of NVIDIA_VISIBLE_DEVICES. The NVIDIA device plugin |
| 49 | +// always returns a list of UUIDs, never mixed with special values. If we see any non-UUID value, |
| 50 | +// it indicates user override and we should fall back to PodResources API. |
| 51 | +func areAllValidGPUUUIDs(values []string) bool { |
| 52 | + if len(values) == 0 { |
| 53 | + return false |
| 54 | + } |
| 55 | + for _, v := range values { |
| 56 | + if !IsGPUUUID(v) { |
| 57 | + return false |
| 58 | + } |
| 59 | + } |
| 60 | + return true |
| 61 | +} |
| 62 | + |
// NVIDIAVisibleDevicesEnvVar is the name of the environment variable that the
// NVIDIA container runtime or the Kubernetes device plugin sets to declare
// which GPUs a container may see.
//
// Its value is a comma-separated list. Entries can be GPU UUIDs
// (e.g., "GPU-uuid1,GPU-uuid2"), MIG instance identifiers, or special values
// such as "all", "none", and "void".
const NVIDIAVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"
| 68 | + |
| 69 | +// ExtractGPUDeviceIDs parses GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable. |
| 70 | +// This is the core parsing function that extracts GPU IDs from a list of environment variables. |
| 71 | +// |
| 72 | +// Returns: |
| 73 | +// - GPU UUIDs as a slice (e.g., ["GPU-uuid1", "GPU-uuid2"]) |
| 74 | +// - Special values like ["all"], ["none"], ["void"] are preserved |
| 75 | +// - nil if the env var is not set or has an empty value |
| 76 | +func ExtractGPUDeviceIDs(envVars []string) []string { |
| 77 | + prefix := NVIDIAVisibleDevicesEnvVar + "=" |
| 78 | + for _, e := range envVars { |
| 79 | + if value, found := strings.CutPrefix(e, prefix); found { |
| 80 | + if value == "" { |
| 81 | + return nil |
| 82 | + } |
| 83 | + return strings.Split(value, ",") |
| 84 | + } |
| 85 | + } |
| 86 | + return nil |
| 87 | +} |
| 88 | + |
| 89 | +// ShouldExtractGPUDeviceIDsFromConfig returns true if GPU device IDs should be extracted |
| 90 | +// from container config/spec based on the current environment. |
| 91 | +// |
| 92 | +// Supported environments: |
| 93 | +// - ECS: env var set by ECS agent in task definition |
| 94 | +// - Kubernetes (non-GKE): env var set by NVIDIA device plugin via Allocate() API |
| 95 | +// |
| 96 | +// Not supported (returns false): |
| 97 | +// - Docker standalone: env var injected at runtime by nvidia-container-toolkit, not in config |
| 98 | +// - Standalone containerd: env var may be injected at runtime, not in spec |
| 99 | +// - GKE: uses custom device plugin + gVisor runtime that ignores NVIDIA_VISIBLE_DEVICES |
| 100 | +// |
| 101 | +// Note: GKE is not explicitly detected here. On GKE, the env var is either not set in a useful way |
| 102 | +// or ignored by the runtime, so extraction returns nil and falls back to PodResources API. |
| 103 | +func ShouldExtractGPUDeviceIDsFromConfig() bool { |
| 104 | + return env.IsECS() || env.IsKubernetes() |
| 105 | +} |
| 106 | + |
| 107 | +// ExtractGPUDeviceIDsFromEnvVars extracts GPU device IDs from environment variables |
| 108 | +// if the current environment supports it. This combines the environment check |
| 109 | +// with the extraction logic for convenience. |
| 110 | +// |
| 111 | +// In Kubernetes environments, this function validates that all extracted values are valid GPU UUIDs. |
| 112 | +// If any non-UUID value is detected (e.g., "all", "none", "0"), it returns nil to indicate |
| 113 | +// a potential user override, allowing the caller to fall back to PodResources API. |
| 114 | +// |
| 115 | +// Use this function when you have a list of environment variable strings (e.g., from container config). |
| 116 | +func ExtractGPUDeviceIDsFromEnvVars(envVars []string) []string { |
| 117 | + if !ShouldExtractGPUDeviceIDsFromConfig() { |
| 118 | + return nil |
| 119 | + } |
| 120 | + ids := ExtractGPUDeviceIDs(envVars) |
| 121 | + // In Kubernetes, validate UUIDs to detect user overrides |
| 122 | + // ECS is excluded because users cannot override env vars set by ECS agent |
| 123 | + if env.IsKubernetes() && !areAllValidGPUUUIDs(ids) { |
| 124 | + return nil |
| 125 | + } |
| 126 | + return ids |
| 127 | +} |
| 128 | + |
| 129 | +// ExtractGPUDeviceIDsFromEnvMap extracts GPU device IDs from an environment variable map |
| 130 | +// if the current environment supports it. |
| 131 | +// |
| 132 | +// In Kubernetes environments, this function validates that all extracted values are valid GPU UUIDs. |
| 133 | +// If any non-UUID value is detected (e.g., "all", "none", "0"), it returns nil to indicate |
| 134 | +// a potential user override, allowing the caller to fall back to PodResources API. |
| 135 | +// |
| 136 | +// Use this function when you have a map of environment variables (e.g., from containerd spec parsing). |
| 137 | +func ExtractGPUDeviceIDsFromEnvMap(envs map[string]string) []string { |
| 138 | + if !ShouldExtractGPUDeviceIDsFromConfig() { |
| 139 | + return nil |
| 140 | + } |
| 141 | + if val, ok := envs[NVIDIAVisibleDevicesEnvVar]; ok && val != "" { |
| 142 | + ids := strings.Split(val, ",") |
| 143 | + // In Kubernetes, validate UUIDs to detect user overrides |
| 144 | + // ECS is excluded because users cannot override env vars set by ECS agent |
| 145 | + if env.IsKubernetes() && !areAllValidGPUUUIDs(ids) { |
| 146 | + return nil |
| 147 | + } |
| 148 | + return ids |
| 149 | + } |
| 150 | + return nil |
| 151 | +} |
0 commit comments