@@ -23,6 +23,7 @@ import (
 	ddnvml "github.com/DataDog/datadog-agent/pkg/gpu/safenvml"
 	gpuutil "github.com/DataDog/datadog-agent/pkg/util/gpu"
 	"github.com/DataDog/datadog-agent/pkg/util/kernel"
+	"github.com/DataDog/datadog-agent/pkg/util/log"
 )
 
 // ErrCannotMatchDevice is returned when a device cannot be matched to a container
@@ -36,7 +37,8 @@ const (
 
 // HasGPUs returns true if the container has GPUs assigned to it.
 func HasGPUs(container *workloadmeta.Container) bool {
-	// ECS: Check GPUDeviceIDs (populated by Docker collector for ECS only)
+	// Primary: Check GPUDeviceIDs extracted from container runtime (ECS, K8s with Docker/containerd)
+	// This is populated by workloadmeta collectors when NVIDIA_VISIBLE_DEVICES is in container config
 	if len(container.GPUDeviceIDs) > 0 {
 		return true
 	}
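
For illustration, a minimal sketch (not the agent's collector code) of how a NVIDIA_VISIBLE_DEVICES value from a container's config could be turned into the kind of GPUDeviceIDs slice that HasGPUs checks; parseNVIDIAVisibleDevices and the sample value are hypothetical:

```go
package main

import (
	"fmt"
	"strings"
)

// parseNVIDIAVisibleDevices is a hypothetical helper: it splits a
// comma-separated NVIDIA_VISIBLE_DEVICES value into individual device IDs,
// the shape GPUDeviceIDs is expected to hold.
func parseNVIDIAVisibleDevices(value string) []string {
	var ids []string
	for _, part := range strings.Split(value, ",") {
		if p := strings.TrimSpace(part); p != "" {
			ids = append(ids, p)
		}
	}
	return ids
}

func main() {
	ids := parseNVIDIAVisibleDevices("GPU-aec058b1-c18e-236e-c14d-49d2990fda0f")
	fmt.Println(len(ids) > 0, ids) // true, one UUID => a HasGPUs-style check succeeds
}
```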
@@ -61,19 +63,28 @@ func HasGPUs(container *workloadmeta.Container) bool {
 
 // MatchContainerDevices matches the devices assigned to a container to the list of available devices
 // It returns a list of devices that are assigned to the container, and an error if any of the devices cannot be matched
+//
+// Priority:
+//  1. GPUDeviceIDs from container runtime (NVIDIA_VISIBLE_DEVICES in container config)
+//     - Works for: ECS, Kubernetes (Docker/containerd) with standard NVIDIA device plugin
+//     - Not available for: GKE (gVisor ignores env var), standalone Docker (runtime injection)
+//  2. Fallback for Kubernetes: PodResources API (ResolvedAllocatedResources)
+//  3. Fallback for standalone Docker: procfs (/proc/PID/environ)
 func MatchContainerDevices(container *workloadmeta.Container, devices []ddnvml.Device) ([]ddnvml.Device, error) {
-	// ECS: Use GPUDeviceIDs (UUID format) extracted from container config at discovery time
-	// This is checked first because ECS uses Docker runtime but needs UUID-based matching
+	// Primary: Use GPUDeviceIDs (UUID format) extracted from container config at discovery time
 	if len(container.GPUDeviceIDs) > 0 {
+		log.Debugf("GPU device source for container %s: runtime (NVIDIA_VISIBLE_DEVICES from config)", container.ID)
		return matchByGPUDeviceIDs(container.GPUDeviceIDs, devices)
 	}
 
 	switch container.Runtime {
 	case workloadmeta.ContainerRuntimeDocker:
+		log.Debugf("GPU device source for container %s: procfs (/proc/%d/environ)", container.ID, container.PID)
 		return matchDockerDevices(container, devices)
 	default:
 		// We have no specific support for other runtimes, so fall back to the Kubernetes device
 		// assignment if it's there
+		log.Debugf("GPU device source for container %s: pod_resources_api", container.ID)
 		return matchKubernetesDevices(container, devices)
 	}
 }
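
The procfs fallback mentioned for the Docker branch can be sketched as below; readNVIDIAVisibleDevicesFromProc is an illustrative stand-in under assumed behavior, not the actual matchDockerDevices implementation:

```go
package main

import (
	"bytes"
	"fmt"
	"os"
	"strings"
)

// readNVIDIAVisibleDevicesFromProc reads /proc/<pid>/environ and extracts the
// NVIDIA_VISIBLE_DEVICES value, illustrating the procfs fallback for
// standalone Docker containers.
func readNVIDIAVisibleDevicesFromProc(pid int) (string, error) {
	raw, err := os.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
	if err != nil {
		return "", err
	}
	// environ entries are NUL-separated KEY=VALUE pairs.
	for _, entry := range bytes.Split(raw, []byte{0}) {
		kv := strings.SplitN(string(entry), "=", 2)
		if len(kv) == 2 && kv[0] == "NVIDIA_VISIBLE_DEVICES" {
			return kv[1], nil
		}
	}
	return "", fmt.Errorf("NVIDIA_VISIBLE_DEVICES not set for pid %d", pid)
}

func main() {
	val, err := readNVIDIAVisibleDevicesFromProc(os.Getpid())
	fmt.Println(val, err)
}
```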
@@ -140,8 +151,9 @@ func matchDockerDevices(container *workloadmeta.Container, devices []ddnvml.Devi
 }
 
 // matchByGPUDeviceIDs matches devices using GPUDeviceIDs from workloadmeta.
-// This is used for ECS containers where GPU UUIDs are provided in the format
-// "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f".
+// This is used for containers where GPU UUIDs are extracted from NVIDIA_VISIBLE_DEVICES
+// in the container config (ECS, Kubernetes with Docker/containerd).
+// Format: "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f" or comma-separated UUIDs.
 // Special values:
 //   - "all" returns all available devices (GPU sharing)
 //   - "none", "void" returns empty slice (no GPU access)
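
A sketch of how the special values and UUID matching documented above could be handled; the device type stands in for ddnvml.Device and the helper names are assumptions, not the agent's actual matchByGPUDeviceIDs:

```go
package main

import (
	"errors"
	"fmt"
	"strings"
)

// device is a stand-in for ddnvml.Device, reduced to the UUID used for matching.
type device struct{ UUID string }

var errCannotMatchDevice = errors.New("cannot match device to container")

// matchByIDs mirrors the documented behavior: "all" returns every available
// device, "none"/"void" return an empty slice, and anything else is treated
// as GPU UUIDs that must each match an available device.
func matchByIDs(ids []string, available []device) ([]device, error) {
	if len(ids) == 1 {
		switch strings.ToLower(ids[0]) {
		case "all":
			return available, nil
		case "none", "void":
			return []device{}, nil
		}
	}
	byUUID := make(map[string]device, len(available))
	for _, d := range available {
		byUUID[strings.ToLower(d.UUID)] = d
	}
	matched := make([]device, 0, len(ids))
	for _, id := range ids {
		d, ok := byUUID[strings.ToLower(id)]
		if !ok {
			return nil, fmt.Errorf("%w: %q", errCannotMatchDevice, id)
		}
		matched = append(matched, d)
	}
	return matched, nil
}

func main() {
	available := []device{{UUID: "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"}}
	got, err := matchByIDs([]string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"}, available)
	fmt.Println(got, err)
}
```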