Skip to content

Commit 71eafe3

Browse files
committed
Extract GPU device id from container runtime
1 parent ff0abbe commit 71eafe3

File tree

7 files changed

+232
-103
lines changed

7 files changed

+232
-103
lines changed

comp/core/workloadmeta/collectors/internal/containerd/container_builder.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ func buildWorkloadMetaContainer(namespace string, container containerd.Container
154154

155155
workloadContainer.EnvVars = envs
156156
workloadContainer.Hostname = spec.Hostname
157+
workloadContainer.GPUDeviceIDs = util.ExtractGPUDeviceIDsFromEnvMap(envs)
157158
if spec.Linux != nil {
158159
workloadContainer.CgroupPath = extractCgroupPath(spec.Linux.CgroupsPath)
159160
}

comp/core/workloadmeta/collectors/internal/docker/docker.go

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,6 @@ import (
4545
const (
4646
collectorID = "docker"
4747
componentName = "workloadmeta-docker"
48-
49-
// nvidiaVisibleDevicesEnvVar is the environment variable set by NVIDIA container runtime
50-
// to specify which GPUs are visible to the container. Values can be:
51-
// - GPU UUIDs: "GPU-uuid" or "GPU-uuid1,GPU-uuid2" (ECS, some K8s setups)
52-
// - Device indices: "0", "1", "0,1" (local Docker)
53-
// - Special values: "all", "none", "void"
54-
nvidiaVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"
5548
)
5649

5750
// imageEventActionSbom is an event that we set to create a fake docker event.
@@ -336,7 +329,7 @@ func (c *collector) buildCollectorEvent(ctx context.Context, ev *docker.Containe
336329
PID: container.State.Pid,
337330
RestartCount: container.RestartCount,
338331
Resources: extractResources(container),
339-
GPUDeviceIDs: extractGPUDeviceIDsForECS(container.Config.Env),
332+
GPUDeviceIDs: util.ExtractGPUDeviceIDsFromEnvVars(container.Config.Env),
340333
}
341334

342335
case events.ActionDie, docker.ActionDied:
@@ -747,39 +740,6 @@ func layersFromDockerHistoryAndInspect(history []image.HistoryResponseItem, insp
747740
return layers
748741
}
749742

750-
// extractGPUDeviceIDsForECS extracts GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable,
751-
// but ONLY when running in ECS. For regular Docker containers, the NVIDIA container toolkit adds
752-
// NVIDIA_VISIBLE_DEVICES in a way that's not visible in container.Config.Env (it's added by the
753-
// runtime, not the container config), so we must rely on reading from procfs at metric collection time.
754-
// In ECS, the env var IS visible in container.Config.Env because ECS sets it directly.
755-
// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
756-
func extractGPUDeviceIDsForECS(envVars []string) []string {
757-
// Only extract from container config in ECS.
758-
// For regular Docker, NVIDIA_VISIBLE_DEVICES is added by the container runtime
759-
// and won't be visible here - the GPU probe will read it from procfs instead.
760-
if !env.IsECS() {
761-
return nil
762-
}
763-
return extractGPUDeviceIDs(envVars)
764-
}
765-
766-
// extractGPUDeviceIDs parses GPU device identifiers from NVIDIA_VISIBLE_DEVICES environment variable.
767-
// ECS typically sets GPU UUIDs (e.g., "GPU-uuid1,GPU-uuid2"), but users can also set "all" for GPU sharing.
768-
// Special values "all", "none", "void" are preserved and handled in matchByGPUDeviceIDs().
769-
// Empty value returns nil (env var set but empty).
770-
func extractGPUDeviceIDs(envVars []string) []string {
771-
prefix := nvidiaVisibleDevicesEnvVar + "="
772-
for _, e := range envVars {
773-
if value, found := strings.CutPrefix(e, prefix); found {
774-
if value == "" {
775-
return nil
776-
}
777-
return strings.Split(value, ",")
778-
}
779-
}
780-
return nil
781-
}
782-
783743
func extractResources(container container.InspectResponse) workloadmeta.ContainerResources {
784744
var resources workloadmeta.ContainerResources
785745

comp/core/workloadmeta/collectors/internal/docker/docker_test.go

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -18,62 +18,6 @@ import (
1818
workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
1919
)
2020

21-
func Test_extractGPUDeviceIDs(t *testing.T) {
22-
tests := []struct {
23-
name string
24-
envVars []string
25-
expected []string
26-
}{
27-
{
28-
name: "single GPU UUID",
29-
envVars: []string{"PATH=/usr/bin", "NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
30-
expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
31-
},
32-
{
33-
name: "multiple GPU UUIDs",
34-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=GPU-aec058b1-c18e-236e-c14d-49d2990fda0f,GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
35-
expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f", "GPU-bec058b1-d18e-336e-d14d-59d2990fda1f"},
36-
},
37-
{
38-
name: "all GPUs",
39-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=all"},
40-
expected: []string{"all"},
41-
},
42-
{
43-
name: "none",
44-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=none"},
45-
expected: []string{"none"},
46-
},
47-
{
48-
name: "void",
49-
envVars: []string{"NVIDIA_VISIBLE_DEVICES=void"},
50-
expected: []string{"void"},
51-
},
52-
{
53-
name: "empty value",
54-
envVars: []string{"NVIDIA_VISIBLE_DEVICES="},
55-
expected: nil,
56-
},
57-
{
58-
name: "no NVIDIA_VISIBLE_DEVICES",
59-
envVars: []string{"PATH=/usr/bin", "HOME=/root"},
60-
expected: nil,
61-
},
62-
{
63-
name: "empty env vars",
64-
envVars: []string{},
65-
expected: nil,
66-
},
67-
}
68-
69-
for _, tt := range tests {
70-
t.Run(tt.name, func(t *testing.T) {
71-
result := extractGPUDeviceIDs(tt.envVars)
72-
assert.Equal(t, tt.expected, result)
73-
})
74-
}
75-
}
76-
7721
func Test_LayersFromDockerHistoryAndInspect(t *testing.T) {
7822
var emptySize int64
7923
var noDiffCmd = "ENV var=dummy"
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package util
7+
8+
import (
9+
"strings"
10+
11+
"github.com/DataDog/datadog-agent/pkg/config/env"
12+
)
13+
14+
// NVIDIAVisibleDevicesEnvVar is the name of the environment variable that the
// NVIDIA container runtime or the Kubernetes device plugin sets to tell a
// container which GPUs it may see. Its value is a comma-separated list of GPU
// UUIDs (e.g., "GPU-uuid1,GPU-uuid2") or MIG instance identifiers, or one of
// the special values "all", "none", "void".
const NVIDIAVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"

// ExtractGPUDeviceIDs parses the GPU device identifiers carried by the
// NVIDIA_VISIBLE_DEVICES entry of envVars, a list of "KEY=VALUE" strings.
//
// Only the first entry for the variable is considered. Its value is split on
// commas and empty segments are discarded, so stray separators such as
// "GPU-a,,GPU-b" or "GPU-a," never yield an empty device ID.
//
// Returns:
//   - the identifiers in their original order (e.g., ["GPU-uuid1", "GPU-uuid2"])
//   - special values such as ["all"], ["none"], ["void"] unchanged
//   - nil if the variable is absent, empty, or made up only of separators
func ExtractGPUDeviceIDs(envVars []string) []string {
	prefix := NVIDIAVisibleDevicesEnvVar + "="
	for _, e := range envVars {
		value, found := strings.CutPrefix(e, prefix)
		if !found {
			continue
		}

		// ids stays nil when every segment is empty, which also covers the
		// "variable set but empty" case without a dedicated branch.
		var ids []string
		for _, id := range strings.Split(value, ",") {
			if id != "" {
				ids = append(ids, id)
			}
		}
		return ids
	}
	return nil
}
39+
40+
// ShouldExtractGPUDeviceIDsFromConfig returns true if GPU device IDs should be extracted
41+
// from container config/spec based on the current environment.
42+
//
43+
// Supported environments:
44+
// - ECS: env var set by ECS agent in task definition
45+
// - Kubernetes (non-GKE): env var set by NVIDIA device plugin via Allocate() API
46+
//
47+
// Not supported (returns false):
48+
// - Docker standalone: env var injected at runtime by nvidia-container-toolkit, not in config
49+
// - Standalone containerd: env var may be injected at runtime, not in spec
50+
// - GKE: uses custom device plugin + gVisor runtime that ignores NVIDIA_VISIBLE_DEVICES
51+
//
52+
// Note: GKE is not explicitly detected here. On GKE, the env var is either not set in a useful way
53+
// or ignored by the runtime, so extraction returns nil and falls back to PodResources API.
54+
func ShouldExtractGPUDeviceIDsFromConfig() bool {
55+
return env.IsECS() || env.IsKubernetes()
56+
}
57+
58+
// ExtractGPUDeviceIDsFromEnvVars extracts GPU device IDs from environment variables
59+
// if the current environment supports it. This combines the environment check
60+
// with the extraction logic for convenience.
61+
//
62+
// Use this function when you have a list of environment variable strings (e.g., from container config).
63+
func ExtractGPUDeviceIDsFromEnvVars(envVars []string) []string {
64+
if !ShouldExtractGPUDeviceIDsFromConfig() {
65+
return nil
66+
}
67+
return ExtractGPUDeviceIDs(envVars)
68+
}
69+
70+
// ExtractGPUDeviceIDsFromEnvMap extracts GPU device IDs from an environment variable map
71+
// if the current environment supports it.
72+
//
73+
// Use this function when you have a map of environment variables (e.g., from containerd spec parsing).
74+
func ExtractGPUDeviceIDsFromEnvMap(envs map[string]string) []string {
75+
if !ShouldExtractGPUDeviceIDsFromConfig() {
76+
return nil
77+
}
78+
if val, ok := envs[NVIDIAVisibleDevicesEnvVar]; ok && val != "" {
79+
return strings.Split(val, ",")
80+
}
81+
return nil
82+
}
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package util
7+
8+
import (
9+
"testing"
10+
11+
"github.com/stretchr/testify/assert"
12+
)
13+
14+
// envMapToSlice renders a map of environment variables as a list of
// "KEY=VALUE" strings. A nil map yields a nil slice; an empty non-nil map
// yields an empty non-nil slice. Entry order follows map iteration and is
// therefore unspecified.
func envMapToSlice(envs map[string]string) []string {
	var pairs []string
	if envs != nil {
		pairs = make([]string, 0, len(envs))
		for name, value := range envs {
			pairs = append(pairs, name+"="+value)
		}
	}
	return pairs
}
25+
26+
// TestExtractGPUDeviceIDs tests ExtractGPUDeviceIDsFromEnvVars and ExtractGPUDeviceIDsFromEnvMap
27+
// with the same test cases, ensuring consistent behavior across different environments.
28+
func TestExtractGPUDeviceIDs(t *testing.T) {
29+
tests := []struct {
30+
name string
31+
envMap map[string]string
32+
isECS bool
33+
isK8s bool
34+
expected []string
35+
}{
36+
// Environment detection tests
37+
{
38+
name: "ECS extracts GPU",
39+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "GPU-xxx"},
40+
isECS: true,
41+
expected: []string{"GPU-xxx"},
42+
},
43+
{
44+
name: "Kubernetes extracts GPU",
45+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "GPU-aaa"},
46+
isK8s: true,
47+
expected: []string{"GPU-aaa"},
48+
},
49+
{
50+
name: "Standalone returns nil",
51+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "GPU-xxx"},
52+
expected: nil,
53+
},
54+
// Parsing tests (in K8s environment)
55+
{
56+
name: "single GPU UUID",
57+
envMap: map[string]string{"PATH": "/usr/bin", "NVIDIA_VISIBLE_DEVICES": "GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
58+
isK8s: true,
59+
expected: []string{"GPU-aec058b1-c18e-236e-c14d-49d2990fda0f"},
60+
},
61+
{
62+
name: "multiple GPU UUIDs",
63+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "GPU-aaa,GPU-bbb,GPU-ccc"},
64+
isK8s: true,
65+
expected: []string{"GPU-aaa", "GPU-bbb", "GPU-ccc"},
66+
},
67+
{
68+
name: "all GPUs",
69+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "all"},
70+
isK8s: true,
71+
expected: []string{"all"},
72+
},
73+
{
74+
name: "none",
75+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "none"},
76+
isK8s: true,
77+
expected: []string{"none"},
78+
},
79+
{
80+
name: "void",
81+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "void"},
82+
isK8s: true,
83+
expected: []string{"void"},
84+
},
85+
{
86+
name: "MIG instance",
87+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": "MIG-abc123-def456"},
88+
isK8s: true,
89+
expected: []string{"MIG-abc123-def456"},
90+
},
91+
{
92+
name: "empty value",
93+
envMap: map[string]string{"NVIDIA_VISIBLE_DEVICES": ""},
94+
isK8s: true,
95+
expected: nil,
96+
},
97+
{
98+
name: "no NVIDIA_VISIBLE_DEVICES",
99+
envMap: map[string]string{"PATH": "/usr/bin"},
100+
isK8s: true,
101+
expected: nil,
102+
},
103+
{
104+
name: "nil map",
105+
envMap: nil,
106+
isK8s: true,
107+
expected: nil,
108+
},
109+
}
110+
111+
for _, tt := range tests {
112+
t.Run(tt.name, func(t *testing.T) {
113+
if tt.isECS {
114+
t.Setenv("ECS_CONTAINER_METADATA_URI_V4", "http://169.254.170.2/v4")
115+
}
116+
if tt.isK8s {
117+
t.Setenv("KUBERNETES_SERVICE_PORT", "443")
118+
}
119+
120+
// Test map-based function (used by containerd)
121+
resultMap := ExtractGPUDeviceIDsFromEnvMap(tt.envMap)
122+
assert.Equal(t, tt.expected, resultMap, "ExtractGPUDeviceIDsFromEnvMap")
123+
124+
// Test slice-based function (used by docker)
125+
envSlice := envMapToSlice(tt.envMap)
126+
resultSlice := ExtractGPUDeviceIDsFromEnvVars(envSlice)
127+
assert.Equal(t, tt.expected, resultSlice, "ExtractGPUDeviceIDsFromEnvVars")
128+
})
129+
}
130+
}

0 commit comments

Comments
 (0)