Commit 7d74be6

Extract GPU device id from container runtime
1 parent ff0abbe

File tree: 4 files changed (+19 −102 lines)

comp/core/workloadmeta/collectors/internal/containerd/container_builder.go

Lines changed: 1 addition & 0 deletions

@@ -154,6 +154,7 @@ func buildWorkloadMetaContainer(namespace string, container containerd.Container
 
 	workloadContainer.EnvVars = envs
 	workloadContainer.Hostname = spec.Hostname
+	workloadContainer.GPUDeviceIDs = util.ExtractGPUDeviceIDsFromEnvMap(envs)
 	if spec.Linux != nil {
 		workloadContainer.CgroupPath = extractCgroupPath(spec.Linux.CgroupsPath)
 	}
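The shared helper util.ExtractGPUDeviceIDsFromEnvMap is called here, but its implementation is not part of this diff. Below is a minimal sketch of what the map-based variant might look like, assuming it mirrors the extractGPUDeviceIDs logic removed from the Docker collector further down; the env-var name and comma-splitting come from that removed code, while the package clause and exact signature are assumptions.

package util // hypothetical package clause; the real helper's import path is not shown in this diff

import "strings"

// ExtractGPUDeviceIDsFromEnvMap reads NVIDIA_VISIBLE_DEVICES from a map of
// environment variables and returns the comma-separated device identifiers,
// or nil when the variable is absent or empty.
func ExtractGPUDeviceIDsFromEnvMap(envs map[string]string) []string {
	value, ok := envs["NVIDIA_VISIBLE_DEVICES"]
	if !ok || value == "" {
		return nil
	}
	// Values can be GPU UUIDs ("GPU-..."), device indices ("0,1"), or the
	// special values "all"/"none"/"void", which are preserved here and
	// interpreted later by matchByGPUDeviceIDs.
	return strings.Split(value, ",")
}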

comp/core/workloadmeta/collectors/internal/docker/docker.go

Lines changed: 1 addition & 41 deletions

@@ -45,13 +45,6 @@ import (
 const (
 	collectorID   = "docker"
 	componentName = "workloadmeta-docker"
-
-	// nvidiaVisibleDevicesEnvVar is the environment variable set by NVIDIA container runtime
-	// to specify which GPUs are visible to the container. Values can be:
-	// - GPU UUIDs: "GPU-uuid" or "GPU-uuid1,GPU-uuid2" (ECS, some K8s setups)
-	// - Device indices: "0", "1", "0,1" (local Docker)
-	// - Special values: "all", "none", "void"
-	nvidiaVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"
 )
 
 // imageEventActionSbom is an event that we set to create a fake docker event.

@@ -336,7 +329,7 @@ func (c *collector) buildCollectorEvent(ctx context.Context, ev *docker.Containe
 			PID:          container.State.Pid,
 			RestartCount: container.RestartCount,
 			Resources:    extractResources(container),
-			GPUDeviceIDs: extractGPUDeviceIDsForECS(container.Config.Env),
+			GPUDeviceIDs: util.ExtractGPUDeviceIDsFromEnvVars(container.Config.Env),
 		}
 
 	case events.ActionDie, docker.ActionDied:

@@ -747,39 +740,6 @@ func layersFromDockerHistoryAndInspect(history []image.HistoryResponseItem, insp
 	return layers
 }
 
-// extractGPUDeviceIDsForECS extracts GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable,
-// but ONLY when running in ECS. For regular Docker containers, the NVIDIA container toolkit adds
-// NVIDIA_VISIBLE_DEVICES in a way that's not visible in container.Config.Env (it's added by the
-// runtime, not the container config), so we must rely on reading from procfs at metric collection time.
-// In ECS, the env var IS visible in container.Config.Env because ECS sets it directly.
-// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
-func extractGPUDeviceIDsForECS(envVars []string) []string {
-	// Only extract from container config in ECS.
-	// For regular Docker, NVIDIA_VISIBLE_DEVICES is added by the container runtime
-	// and won't be visible here - the GPU probe will read it from procfs instead.
-	if !env.IsECS() {
-		return nil
-	}
-	return extractGPUDeviceIDs(envVars)
-}
-
-// extractGPUDeviceIDs parses GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable.
-// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
-// Special values "all", "none", "void" are preserved and handled in matchByGPUDeviceIDs().
-// Empty value returns nil (env var set but empty).
-func extractGPUDeviceIDs(envVars []string) []string {
-	prefix := nvidiaVisibleDevicesEnvVar + "="
-	for _, e := range envVars {
-		if value, found := strings.CutPrefix(e, prefix); found {
-			if value == "" {
-				return nil
-			}
-			return strings.Split(value, ",")
-		}
-	}
-	return nil
-}
-
 func extractResources(container container.InspectResponse) workloadmeta.ContainerResources {
 	var resources workloadmeta.ContainerResources
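Only the call site util.ExtractGPUDeviceIDsFromEnvVars(container.Config.Env) appears in this diff; the helper's body does not. A sketch of the slice-based variant, assuming it keeps the parsing of the removed extractGPUDeviceIDs but drops the env.IsECS() gate (both assumptions), might look like this.

package util // hypothetical package clause, as in the sketch above

import "strings"

// ExtractGPUDeviceIDsFromEnvVars parses NVIDIA_VISIBLE_DEVICES from a slice of
// "KEY=VALUE" environment variables, as found in container.Config.Env.
func ExtractGPUDeviceIDsFromEnvVars(envVars []string) []string {
	const prefix = "NVIDIA_VISIBLE_DEVICES="
	for _, e := range envVars {
		if value, found := strings.CutPrefix(e, prefix); found {
			if value == "" {
				return nil // variable set but empty
			}
			return strings.Split(value, ",")
		}
	}
	return nil // variable not present in the container config
}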

comp/core/workloadmeta/collectors/internal/docker/docker_test.go

Lines changed: 0 additions & 56 deletions

@@ -18,62 +18,6 @@ import (
 	workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
 )
 
-func Test_extractGPUDeviceIDs(t *testing.T) {
-	tests := []struct {
-		name     string
-		envVars  []string
-		expected []string
-	}{
-		{
-			name:     "single GPU UUID",
-			envVars:  []string{"PATH=/usr/bin", "NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
-			expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
-		},
-		{
-			name:     "multiple GPU UUIDs",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f,GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
-			expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f", "GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
-		},
-		{
-			name:     "all GPUs",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=all"},
-			expected: []string{"all"},
-		},
-		{
-			name:     "none",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=none"},
-			expected: []string{"none"},
-		},
-		{
-			name:     "void",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=void"},
-			expected: []string{"void"},
-		},
-		{
-			name:     "empty value",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES="},
-			expected: nil,
-		},
-		{
-			name:     "no NVIDIA_VISIBLE_DEVICES",
-			envVars:  []string{"PATH=/usr/bin", "HOME=/root"},
-			expected: nil,
-		},
-		{
-			name:     "empty env vars",
-			envVars:  []string{},
-			expected: nil,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			result := extractGPUDeviceIDs(tt.envVars)
-			assert.Equal(t, tt.expected, result)
-		})
-	}
-}
-
 func Test_LayersFromDockerHistoryAndInspect(t *testing.T) {
 	var emptySize int64
 	var noDiffCmd = "ENV var=dummy"
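The removed table test exercised the parsing behavior end to end. If the shared helper preserves that behavior, the cases could be ported next to it; a hedged example of one ported case follows (the helper's package location and test placement are assumptions).

package util // hypothetical; same package as the sketches above

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// Hypothetical port of the removed "single GPU UUID" case against the shared helper.
func Test_ExtractGPUDeviceIDsFromEnvVars_SingleUUID(t *testing.T) {
	envVars := []string{"PATH=/usr/bin", "NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"}
	result := ExtractGPUDeviceIDsFromEnvVars(envVars)
	assert.Equal(t, []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"}, result)
}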

pkg/gpu/containers/containers.go

Lines changed: 17 additions & 5 deletions

@@ -23,6 +23,7 @@ import (
 	ddnvml "github.com/DataDog/datadog-agent/pkg/gpu/safenvml"
 	gpuutil "github.com/DataDog/datadog-agent/pkg/util/gpu"
 	"github.com/DataDog/datadog-agent/pkg/util/kernel"
+	"github.com/DataDog/datadog-agent/pkg/util/log"
 )
 
 // ErrCannotMatchDevice is returned when a device cannot be matched to a container

@@ -36,7 +37,8 @@ const (
 
 // HasGPUs returns true if the container has GPUs assigned to it.
 func HasGPUs(container *workloadmeta.Container) bool {
-	// ECS: Check GPUDeviceIDs (populated by Docker collector for ECS only)
+	// Primary: Check GPUDeviceIDs extracted from container runtime (ECS, K8s with Docker/containerd)
+	// This is populated by workloadmeta collectors when NVIDIA_VISIBLE_DEVICES is in container config
 	if len(container.GPUDeviceIDs) > 0 {
 		return true
 	}

@@ -61,19 +63,28 @@ func HasGPUs(container *workloadmeta.Container) bool {
 
 // MatchContainerDevices matches the devices assigned to a container to the list of available devices
 // It returns a list of devices that are assigned to the container, and an error if any of the devices cannot be matched
+//
+// Priority:
+// 1. GPUDeviceIDs from container runtime (NVIDIA_VISIBLE_DEVICES in container config)
+//    - Works for: ECS, Kubernetes (Docker/containerd) with standard NVIDIA device plugin
+//    - Not available for: GKE (gVisor ignores env var), standalone Docker (runtime injection)
+// 2. Fallback for Kubernetes: PodResources API (ResolvedAllocatedResources)
+// 3. Fallback for standalone Docker: procfs (/proc/PID/environ)
 func MatchContainerDevices(container *workloadmeta.Container, devices []ddnvml.Device) ([]ddnvml.Device, error) {
-	// ECS: Use GPUDeviceIDs (UUID format) extracted from container config at discovery time
-	// This is checked first because ECS uses Docker runtime but needs UUID-based matching
+	// Primary: Use GPUDeviceIDs (UUID format) extracted from container config at discovery time
 	if len(container.GPUDeviceIDs) > 0 {
+		log.Debugf("GPU device source for container %s: runtime (NVIDIA_VISIBLE_DEVICES from config)", container.ID)
 		return matchByGPUDeviceIDs(container.GPUDeviceIDs, devices)
 	}
 
 	switch container.Runtime {
 	case workloadmeta.ContainerRuntimeDocker:
+		log.Debugf("GPU device source for container %s: procfs (/proc/%d/environ)", container.ID, container.PID)
 		return matchDockerDevices(container, devices)
 	default:
 		// We have no specific support for other runtimes, so fall back to the Kubernetes device
 		// assignment if it's there
+		log.Debugf("GPU device source for container %s: pod_resources_api", container.ID)
 		return matchKubernetesDevices(container, devices)
 	}
 }

@@ -140,8 +151,9 @@ func matchDockerDevices(container *workloadmeta.Container, devices []ddnvml.Devi
 }
 
 // matchByGPUDeviceIDs matches devices using GPUDeviceIDs from workloadmeta.
-// This is used for ECS containers where GPU UUIDs are provided in the format
-// "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f".
+// This is used for containers where GPU UUIDs are extracted from NVIDIA_VISIBLE_DEVICES
+// in the container config (ECS, Kubernetes with Docker/containerd).
+// Format: "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f" or comma-separated UUIDs.
 // Special values:
 // - "all" returns all available devices (GPU sharing)
 // - "none", "void" returns empty slice (no GPU access)
