Commit 7d74be6

Extract GPU device id from container runtime
1 parent ff0abbe

File tree: 4 files changed (+19 −102 lines)

comp/core/workloadmeta/collectors/internal/containerd/container_builder.go

Lines changed: 1 addition & 0 deletions

@@ -154,6 +154,7 @@ func buildWorkloadMetaContainer(namespace string, container containerd.Container
 
 	workloadContainer.EnvVars = envs
 	workloadContainer.Hostname = spec.Hostname
+	workloadContainer.GPUDeviceIDs = util.ExtractGPUDeviceIDsFromEnvMap(envs)
 	if spec.Linux != nil {
 		workloadContainer.CgroupPath = extractCgroupPath(spec.Linux.CgroupsPath)
 	}
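The shared helper util.ExtractGPUDeviceIDsFromEnvMap is called here, but its implementation is not part of this diff. Below is a minimal sketch of what the map-based variant might look like, assuming it mirrors the extractGPUDeviceIDs logic removed from the Docker collector further down; the env-var name and comma-splitting come from that removed code, while the package clause and exact signature are assumptions.

package util // hypothetical package clause; the real helper's import path is not shown in this diff

import "strings"

// ExtractGPUDeviceIDsFromEnvMap reads NVIDIA_VISIBLE_DEVICES from a map of
// environment variables and returns the comma-separated device identifiers,
// or nil when the variable is absent or empty.
func ExtractGPUDeviceIDsFromEnvMap(envs map[string]string) []string {
	value, ok := envs["NVIDIA_VISIBLE_DEVICES"]
	if !ok || value == "" {
		return nil
	}
	// Values can be GPU UUIDs ("GPU-..."), device indices ("0,1"), or the
	// special values "all"/"none"/"void", which are preserved here and
	// interpreted later by matchByGPUDeviceIDs.
	return strings.Split(value, ",")
}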

comp/core/workloadmeta/collectors/internal/docker/docker.go

Lines changed: 1 addition & 41 deletions

@@ -45,13 +45,6 @@ import (
 const (
 	collectorID   = "docker"
 	componentName = "workloadmeta-docker"
-
-	// nvidiaVisibleDevicesEnvVar is the environment variable set by NVIDIA container runtime
-	// to specify which GPUs are visible to the container. Values can be:
-	// - GPU UUIDs: "GPU-uuid" or "GPU-uuid1,GPU-uuid2" (ECS, some K8s setups)
-	// - Device indices: "0", "1", "0,1" (local Docker)
-	// - Special values: "all", "none", "void"
-	nvidiaVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"
 )
 
 // imageEventActionSbom is an event that we set to create a fake docker event.

@@ -336,7 +329,7 @@ func (c *collector) buildCollectorEvent(ctx context.Context, ev *docker.Containe
 			PID:          container.State.Pid,
 			RestartCount: container.RestartCount,
 			Resources:    extractResources(container),
-			GPUDeviceIDs: extractGPUDeviceIDsForECS(container.Config.Env),
+			GPUDeviceIDs: util.ExtractGPUDeviceIDsFromEnvVars(container.Config.Env),
 		}
 
 	case events.ActionDie, docker.ActionDied:

@@ -747,39 +740,6 @@ func layersFromDockerHistoryAndInspect(history []image.HistoryResponseItem, insp
 	return layers
 }
 
-// extractGPUDeviceIDsForECS extracts GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable,
-// but ONLY when running in ECS. For regular Docker containers, the NVIDIA container toolkit adds
-// NVIDIA_VISIBLE_DEVICES in a way that's not visible in container.Config.Env (it's added by the
-// runtime, not the container config), so we must rely on reading from procfs at metric collection time.
-// In ECS, the env var IS visible in container.Config.Env because ECS sets it directly.
-// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
-func extractGPUDeviceIDsForECS(envVars []string) []string {
-	// Only extract from container config in ECS.
-	// For regular Docker, NVIDIA_VISIBLE_DEVICES is added by the container runtime
-	// and won't be visible here - the GPU probe will read it from procfs instead.
-	if !env.IsECS() {
-		return nil
-	}
-	return extractGPUDeviceIDs(envVars)
-}
-
-// extractGPUDeviceIDs parses GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable.
-// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
-// Special values "all", "none", "void" are preserved and handled in matchByGPUDeviceIDs().
-// Empty value returns nil (env var set but empty).
-func extractGPUDeviceIDs(envVars []string) []string {
-	prefix := nvidiaVisibleDevicesEnvVar + "="
-	for _, e := range envVars {
-		if value, found := strings.CutPrefix(e, prefix); found {
-			if value == "" {
-				return nil
-			}
-			return strings.Split(value, ",")
-		}
-	}
-	return nil
-}
-
 func extractResources(container container.InspectResponse) workloadmeta.ContainerResources {
 	var resources workloadmeta.ContainerResources
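Only the call site util.ExtractGPUDeviceIDsFromEnvVars(container.Config.Env) appears in this diff; the helper's body does not. A sketch of the slice-based variant, assuming it keeps the parsing of the removed extractGPUDeviceIDs but drops the env.IsECS() gate (both assumptions), might look like this.

package util // hypothetical package clause, as in the sketch above

import "strings"

// ExtractGPUDeviceIDsFromEnvVars parses NVIDIA_VISIBLE_DEVICES from a slice of
// "KEY=VALUE" environment variables, as found in container.Config.Env.
func ExtractGPUDeviceIDsFromEnvVars(envVars []string) []string {
	const prefix = "NVIDIA_VISIBLE_DEVICES="
	for _, e := range envVars {
		if value, found := strings.CutPrefix(e, prefix); found {
			if value == "" {
				return nil // variable set but empty
			}
			return strings.Split(value, ",")
		}
	}
	return nil // variable not present in the container config
}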

comp/core/workloadmeta/collectors/internal/docker/docker_test.go

Lines changed: 0 additions & 56 deletions

@@ -18,62 +18,6 @@ import (
 	workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
 )
 
-func Test_extractGPUDeviceIDs(t *testing.T) {
-	tests := []struct {
-		name     string
-		envVars  []string
-		expected []string
-	}{
-		{
-			name:     "single GPU UUID",
-			envVars:  []string{"PATH=/usr/bin", "NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
-			expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
-		},
-		{
-			name:     "multiple GPU UUIDs",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f,GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
-			expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f", "GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
-		},
-		{
-			name:     "all GPUs",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=all"},
-			expected: []string{"all"},
-		},
-		{
-			name:     "none",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=none"},
-			expected: []string{"none"},
-		},
-		{
-			name:     "void",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES=void"},
-			expected: []string{"void"},
-		},
-		{
-			name:     "empty value",
-			envVars:  []string{"NVIDIA_VISIBLE_DEVICES="},
-			expected: nil,
-		},
-		{
-			name:     "no NVIDIA_VISIBLE_DEVICES",
-			envVars:  []string{"PATH=/usr/bin", "HOME=/root"},
-			expected: nil,
-		},
-		{
-			name:     "empty env vars",
-			envVars:  []string{},
-			expected: nil,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			result := extractGPUDeviceIDs(tt.envVars)
-			assert.Equal(t, tt.expected, result)
-		})
-	}
-}
-
 func Test_LayersFromDockerHistoryAndInspect(t *testing.T) {
 	var emptySize int64
 	var noDiffCmd = "ENV var=dummy"
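The removed table test exercised the parsing behavior end to end. If the shared helper preserves that behavior, the cases could be ported next to it; a hedged example of one ported case follows (the helper's package location and test placement are assumptions).

package util // hypothetical; same package as the sketches above

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// Hypothetical port of the removed "single GPU UUID" case against the shared helper.
func Test_ExtractGPUDeviceIDsFromEnvVars_SingleUUID(t *testing.T) {
	envVars := []string{"PATH=/usr/bin", "NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"}
	result := ExtractGPUDeviceIDsFromEnvVars(envVars)
	assert.Equal(t, []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"}, result)
}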

pkg/gpu/containers/containers.go

Lines changed: 17 additions & 5 deletions

@@ -23,6 +23,7 @@ import (
 	ddnvml "github.com/DataDog/datadog-agent/pkg/gpu/safenvml"
 	gpuutil "github.com/DataDog/datadog-agent/pkg/util/gpu"
 	"github.com/DataDog/datadog-agent/pkg/util/kernel"
+	"github.com/DataDog/datadog-agent/pkg/util/log"
 )
 
 // ErrCannotMatchDevice is returned when a device cannot be matched to a container

@@ -36,7 +37,8 @@ const (
 
 // HasGPUs returns true if the container has GPUs assigned to it.
 func HasGPUs(container *workloadmeta.Container) bool {
-	// ECS: Check GPUDeviceIDs (populated by Docker collector for ECS only)
+	// Primary: Check GPUDeviceIDs extracted from container runtime (ECS, K8s with Docker/containerd)
+	// This is populated by workloadmeta collectors when NVIDIA_VISIBLE_DEVICES is in container config
 	if len(container.GPUDeviceIDs) > 0 {
 		return true
 	}

@@ -61,19 +63,28 @@ func HasGPUs(container *workloadmeta.Container) bool {
 
 // MatchContainerDevices matches the devices assigned to a container to the list of available devices
 // It returns a list of devices that are assigned to the container, and an error if any of the devices cannot be matched
+//
+// Priority:
+// 1. GPUDeviceIDs from container runtime (NVIDIA_VISIBLE_DEVICES in container config)
+//    - Works for: ECS, Kubernetes (Docker/containerd) with standard NVIDIA device plugin
+//    - Not available for: GKE (gVisor ignores env var), standalone Docker (runtime injection)
+// 2. Fallback for Kubernetes: PodResources API (ResolvedAllocatedResources)
+// 3. Fallback for standalone Docker: procfs (/proc/PID/environ)
 func MatchContainerDevices(container *workloadmeta.Container, devices []ddnvml.Device) ([]ddnvml.Device, error) {
-	// ECS: Use GPUDeviceIDs (UUID format) extracted from container config at discovery time
-	// This is checked first because ECS uses Docker runtime but needs UUID-based matching
+	// Primary: Use GPUDeviceIDs (UUID format) extracted from container config at discovery time
 	if len(container.GPUDeviceIDs) > 0 {
+		log.Debugf("GPU device source for container %s: runtime (NVIDIA_VISIBLE_DEVICES from config)", container.ID)
 		return matchByGPUDeviceIDs(container.GPUDeviceIDs, devices)
 	}
 
 	switch container.Runtime {
 	case workloadmeta.ContainerRuntimeDocker:
+		log.Debugf("GPU device source for container %s: procfs (/proc/%d/environ)", container.ID, container.PID)
 		return matchDockerDevices(container, devices)
 	default:
 		// We have no specific support for other runtimes, so fall back to the Kubernetes device
 		// assignment if it's there
+		log.Debugf("GPU device source for container %s: pod_resources_api", container.ID)
 		return matchKubernetesDevices(container, devices)
 	}
 }

@@ -140,8 +151,9 @@ func matchDockerDevices(container *workloadmeta.Container, devices []ddnvml.Devi
 }
 
 // matchByGPUDeviceIDs matches devices using GPUDeviceIDs from workloadmeta.
-// This is used for ECS containers where GPU UUIDs are provided in the format
-// "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f".
+// This is used for containers where GPU UUIDs are extracted from NVIDIA_VISIBLE_DEVICES
+// in the container config (ECS, Kubernetes with Docker/containerd).
+// Format: "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f" or comma-separated UUIDs.
 // Special values:
 // - "all" returns all available devices (GPU sharing)
 // - "none", "void" returns empty slice (no GPU access)
