Skip to content

Commit 184cfcf

Browse files
committed
Extract GPU device id from container runtime
1 parent ff0abbe commit 184cfcf

File tree

7 files changed

+385
-103
lines changed

7 files changed

+385
-103
lines changed

comp/core/workloadmeta/collectors/internal/containerd/container_builder.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ func buildWorkloadMetaContainer(namespace string, container containerd.Container
154154

155155
workloadContainer.EnvVars = envs
156156
workloadContainer.Hostname = spec.Hostname
157+
workloadContainer.GPUDeviceIDs = util.ExtractGPUDeviceIDsFromEnvMap(envs)
157158
if spec.Linux != nil {
158159
workloadContainer.CgroupPath = extractCgroupPath(spec.Linux.CgroupsPath)
159160
}

comp/core/workloadmeta/collectors/internal/docker/docker.go

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,6 @@ import (
4545
const (
4646
collectorID = "docker"
4747
componentName = "workloadmeta-docker"
48-
49-
// nvidiaVisibleDevicesEnvVar is the environment variable set by NVIDIA container runtime
50-
// to specify which GPUs are visible to the container. Values can be:
51-
// - GPU UUIDs: "GPU-uuid" or "GPU-uuid1,GPU-uuid2" (ECS, some K8s setups)
52-
// - Device indices: "0", "1", "0,1" (local Docker)
53-
// - Special values: "all", "none", "void"
54-
nvidiaVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"
5548
)
5649

5750
// imageEventActionSbom is an event that we set to create a fake docker event.
@@ -336,7 +329,7 @@ func (c *collector) buildCollectorEvent(ctx context.Context, ev *docker.Containe
336329
PID: container.State.Pid,
337330
RestartCount: container.RestartCount,
338331
Resources: extractResources(container),
339-
GPUDeviceIDs: extractGPUDeviceIDsForECS(container.Config.Env),
332+
GPUDeviceIDs: util.ExtractGPUDeviceIDsFromEnvVars(container.Config.Env),
340333
}
341334

342335
case events.ActionDie, docker.ActionDied:
@@ -747,39 +740,6 @@ func layersFromDockerHistoryAndInspect(history []image.HistoryResponseItem, insp
747740
return layers
748741
}
749742

750-
// extractGPUDeviceIDsForECS extracts GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable,
751-
// but ONLY when running in ECS. For regular Docker containers, the NVIDIA container toolkit adds
752-
// NVIDIA_VISIBLE_DEVICES in a way that's not visible in container.Config.Env (it's added by the
753-
// runtime, not the container config), so we must rely on reading from procfs at metric collection time.
754-
// In ECS, the env var IS visible in container.Config.Env because ECS sets it directly.
755-
// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
756-
func extractGPUDeviceIDsForECS(envVars []string) []string {
757-
// Only extract from container config in ECS.
758-
// For regular Docker, NVIDIA_VISIBLE_DEVICES is added by the container runtime
759-
// and won't be visible here - the GPU probe will read it from procfs instead.
760-
if !env.IsECS() {
761-
return nil
762-
}
763-
return extractGPUDeviceIDs(envVars)
764-
}
765-
766-
// extractGPUDeviceIDs parses GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable.
767-
// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
768-
// Special values "all", "none", "void" are preserved and handled in matchByGPUDeviceIDs().
769-
// Empty value returns nil (env var set but empty).
770-
func extractGPUDeviceIDs(envVars []string) []string {
771-
prefix := nvidiaVisibleDevicesEnvVar + "="
772-
for _, e := range envVars {
773-
if value, found := strings.CutPrefix(e, prefix); found {
774-
if value == "" {
775-
return nil
776-
}
777-
return strings.Split(value, ",")
778-
}
779-
}
780-
return nil
781-
}
782-
783743
func extractResources(container container.InspectResponse) workloadmeta.ContainerResources {
784744
var resources workloadmeta.ContainerResources
785745

comp/core/workloadmeta/collectors/internal/docker/docker_test.go

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -18,62 +18,6 @@ import (
1818
workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
1919
)
2020

21-
func Test_extractGPUDeviceIDs(t *testing.T) {
22-
tests := []struct {
23-
name string
24-
envVars []string
25-
expected []string
26-
}{
27-
{
28-
name: "single GPU UUID",
29-
envVars: []string{"PATH=/usr/bin", "NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
30-
expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
31-
},
32-
{
33-
name: "multiple GPU UUIDs",
34-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f,GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
35-
expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f", "GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
36-
},
37-
{
38-
name: "all GPUs",
39-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=all"},
40-
expected: []string{"all"},
41-
},
42-
{
43-
name: "none",
44-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=none"},
45-
expected: []string{"none"},
46-
},
47-
{
48-
name: "void",
49-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=void"},
50-
expected: []string{"void"},
51-
},
52-
{
53-
name: "empty value",
54-
envVars: []string{"NVIDIA_VISIBLE_DEVICES="},
55-
expected: nil,
56-
},
57-
{
58-
name: "no NVIDIA_VISIBLE_DEVICES",
59-
envVars: []string{"PATH=/usr/bin", "HOME=/root"},
60-
expected: nil,
61-
},
62-
{
63-
name: "empty env vars",
64-
envVars: []string{},
65-
expected: nil,
66-
},
67-
}
68-
69-
for _, tt := range tests {
70-
t.Run(tt.name, func(t *testing.T) {
71-
result := extractGPUDeviceIDs(tt.envVars)
72-
assert.Equal(t, tt.expected, result)
73-
})
74-
}
75-
}
76-
7721
func Test_LayersFromDockerHistoryAndInspect(t *testing.T) {
7822
var emptySize int64
7923
var noDiffCmd = "ENV var=dummy"
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package util
7+
8+
import (
9+
"regexp"
10+
"strings"
11+
12+
"github.com/DataDog/datadog-agent/pkg/config/env"
13+
)
14+
15+
// gpuUUIDRegex recognizes NVIDIA device identifiers written in UUID form.
// The expression is anchored at both ends, so the whole input must be a
// single identifier with no surrounding characters.
//
// Three shapes are accepted, where <uuid> is 8-4-4-4-12 hexadecimal digits
// (upper or lower case):
//   - GPU-<uuid>                 physical GPU
//   - MIG-<uuid>                 MIG device, modern format
//   - MIG-GPU-<uuid>/<gi>/<ci>   MIG device, legacy format; <gi> and <ci> are
//     decimal numbers
//
// It exists to tell UUIDs assigned by the NVIDIA device plugin apart from
// values a user may put in NVIDIA_VISIBLE_DEVICES themselves, such as "all",
// "none", or plain device indices.
//
// Background reading:
//   - NVML API (nvmlDeviceGetUUID): https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html
//   - NVIDIA Container Toolkit: https://github.com/NVIDIA/nvidia-container-toolkit/blob/main/cmd/nvidia-container-runtime/README.md
//   - MIG User Guide: https://docs.nvidia.com/datacenter/tesla/mig-user-guide/
var gpuUUIDRegex = regexp.MustCompile(`^(?:` +
	// Physical GPU.
	`GPU-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}` +
	`|` +
	// MIG device, modern format.
	`MIG-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}` +
	`|` +
	// MIG device, legacy format with GPU-instance and compute-instance suffix.
	`MIG-GPU-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}/\d+/\d+` +
	`)$`)
38+
39+
// IsGPUUUID returns true if the value is a valid NVIDIA GPU or MIG device UUID.
40+
// This distinguishes device plugin-assigned UUIDs from user overrides like "all", "none", or indices.
41+
func IsGPUUUID(value string) bool {
42+
return gpuUUIDRegex.MatchString(value)
43+
}
44+
45+
// areAllValidGPUUUIDs returns true if all values in the slice are valid GPU UUIDs.
46+
// Returns false if the slice is empty or contains any non-UUID value (e.g., "all", "none", "0").
47+
//
48+
// This is used to detect user overrides of NVIDIA_VISIBLE_DEVICES. The NVIDIA device plugin
49+
// always returns a list of UUIDs, never mixed with special values. If we see any non-UUID value,
50+
// it indicates user override and we should fall back to PodResources API.
51+
func areAllValidGPUUUIDs(values []string) bool {
52+
if len(values) == 0 {
53+
return false
54+
}
55+
for _, v := range values {
56+
if !IsGPUUUID(v) {
57+
return false
58+
}
59+
}
60+
return true
61+
}
62+
63+
// NVIDIAVisibleDevicesEnvVar names the environment variable through which the
// NVIDIA container runtime or the Kubernetes device plugin tells a container
// which GPUs it may see.
//
// Its value is a comma-separated list. Entries may be GPU UUIDs
// (e.g., "GPU-uuid1,GPU-uuid2"), MIG instance identifiers, or one of the
// special values "all", "none", "void".
const NVIDIAVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"
68+
69+
// ExtractGPUDeviceIDs parses GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable.
70+
// This is the core parsing function that extracts GPU IDs from a list of environment variables.
71+
//
72+
// Returns:
73+
// - GPU UUIDs as a slice (e.g., ["GPU-uuid1", "GPU-uuid2"])
74+
// - Special values like ["all"], ["none"], ["void"] are preserved
75+
// - nil if the env var is not set or has an empty value
76+
func ExtractGPUDeviceIDs(envVars []string) []string {
77+
prefix := NVIDIAVisibleDevicesEnvVar + "="
78+
for _, e := range envVars {
79+
if value, found := strings.CutPrefix(e, prefix); found {
80+
if value == "" {
81+
return nil
82+
}
83+
return strings.Split(value, ",")
84+
}
85+
}
86+
return nil
87+
}
88+
89+
// ShouldExtractGPUDeviceIDsFromConfig returns true if GPU device IDs should be extracted
90+
// from container config/spec based on the current environment.
91+
//
92+
// Supported environments:
93+
// - ECS: env var set by ECS agent in task definition
94+
// - Kubernetes (non-GKE): env var set by NVIDIA device plugin via Allocate() API
95+
//
96+
// Not supported (returns false):
97+
// - Docker standalone: env var injected at runtime by nvidia-container-toolkit, not in config
98+
// - Standalone containerd: env var may be injected at runtime, not in spec
99+
// - GKE: uses custom device plugin + gVisor runtime that ignores NVIDIA_VISIBLE_DEVICES
100+
//
101+
// Note: GKE is not explicitly detected here. On GKE, the env var is either not set in a useful way
102+
// or ignored by the runtime, so extraction returns nil and falls back to PodResources API.
103+
func ShouldExtractGPUDeviceIDsFromConfig() bool {
104+
return env.IsECS() || env.IsKubernetes()
105+
}
106+
107+
// ExtractGPUDeviceIDsFromEnvVars extracts GPU device IDs from environment variables
108+
// if the current environment supports it. This combines the environment check
109+
// with the extraction logic for convenience.
110+
//
111+
// In Kubernetes environments, this function validates that all extracted values are valid GPU UUIDs.
112+
// If any non-UUID value is detected (e.g., "all", "none", "0"), it returns nil to indicate
113+
// a potential user override, allowing the caller to fall back to PodResources API.
114+
//
115+
// Use this function when you have a list of environment variable strings (e.g., from container config).
116+
func ExtractGPUDeviceIDsFromEnvVars(envVars []string) []string {
117+
if !ShouldExtractGPUDeviceIDsFromConfig() {
118+
return nil
119+
}
120+
ids := ExtractGPUDeviceIDs(envVars)
121+
// In Kubernetes, validate UUIDs to detect user overrides
122+
// ECS is excluded because users cannot override env vars set by ECS agent
123+
if env.IsKubernetes() && !areAllValidGPUUUIDs(ids) {
124+
return nil
125+
}
126+
return ids
127+
}
128+
129+
// ExtractGPUDeviceIDsFromEnvMap extracts GPU device IDs from an environment variable map
130+
// if the current environment supports it.
131+
//
132+
// In Kubernetes environments, this function validates that all extracted values are valid GPU UUIDs.
133+
// If any non-UUID value is detected (e.g., "all", "none", "0"), it returns nil to indicate
134+
// a potential user override, allowing the caller to fall back to PodResources API.
135+
//
136+
// Use this function when you have a map of environment variables (e.g., from containerd spec parsing).
137+
func ExtractGPUDeviceIDsFromEnvMap(envs map[string]string) []string {
138+
if !ShouldExtractGPUDeviceIDsFromConfig() {
139+
return nil
140+
}
141+
if val, ok := envs[NVIDIAVisibleDevicesEnvVar]; ok && val != "" {
142+
ids := strings.Split(val, ",")
143+
// In Kubernetes, validate UUIDs to detect user overrides
144+
// ECS is excluded because users cannot override env vars set by ECS agent
145+
if env.IsKubernetes() && !areAllValidGPUUUIDs(ids) {
146+
return nil
147+
}
148+
return ids
149+
}
150+
return nil
151+
}

0 commit comments

Comments
 (0)