Skip to content

Commit 5075c12

Browse files
authored
[EBPF] gpu: use inode and file size to index symbol files (#33937)
1 parent 91fb50c commit 5075c12

File tree

3 files changed

+80
-5
lines changed

3 files changed

+80
-5
lines changed

pkg/gpu/context.go

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
2121
"github.com/DataDog/datadog-agent/pkg/errors"
2222
"github.com/DataDog/datadog-agent/pkg/gpu/cuda"
23+
"github.com/DataDog/datadog-agent/pkg/security/utils"
2324
"github.com/DataDog/datadog-agent/pkg/util/ktime"
2425
"github.com/DataDog/datadog-agent/pkg/util/log"
2526
)
@@ -41,7 +42,7 @@ type systemContext struct {
4142
smVersionSet map[uint32]struct{}
4243

4344
// cudaSymbols maps each symbol file, identified by its inode and file size, to its Fatbin file data
44-
cudaSymbols map[string]*symbolsEntry
45+
cudaSymbols map[symbolFileIdentifier]*symbolsEntry
4546

4647
// pidMaps maps each process ID to its memory maps
4748
pidMaps map[int][]*procfs.ProcMap
@@ -77,6 +78,14 @@ type systemContext struct {
7778
fatbinTelemetry *fatbinTelemetry
7879
}
7980

81+
// symbolFileIdentifier holds the inode and file size of a symbol file, which we use to avoid
82+
// parsing the same file multiple times when it has different paths (e.g., symlinks in /proc/PID/root).
83+
// We add fileSize to the identifier to mitigate possible issues with inode reuse.
// NOTE(review): inode numbers are normally only unique within a single filesystem, and no
// device ID is part of this identifier; confirm that files on different filesystems
// (e.g., different container roots) cannot collide on inode and size.
84+
type symbolFileIdentifier struct {
85+
// inode is the inode number taken from the file's stat result.
inode int
86+
// fileSize is the file size taken from the file's stat result.
fileSize int64
87+
}
88+
8089
// contextTelemetry holds telemetry elements for the context
8190
type contextTelemetry struct {
8291
symbolCacheSize telemetry.Gauge
@@ -106,7 +115,7 @@ func getSystemContext(nvmlLib nvml.Interface, procRoot string, wmeta workloadmet
106115
ctx := &systemContext{
107116
deviceSmVersions: make(map[int]int),
108117
smVersionSet: make(map[uint32]struct{}),
109-
cudaSymbols: make(map[string]*symbolsEntry),
118+
cudaSymbols: make(map[symbolFileIdentifier]*symbolsEntry),
110119
pidMaps: make(map[int][]*procfs.ProcMap),
111120
nvmlLib: nvmlLib,
112121
procRoot: procRoot,
@@ -186,8 +195,23 @@ func (ctx *systemContext) fillDeviceInfo() error {
186195
return nil
187196
}
188197

198+
// buildSymbolFileIdentifier stats the file at path and returns a symbolFileIdentifier
// built from the inode number and size found in the stat result. If the stat call fails,
// it returns the zero-value identifier and an error wrapping the stat error.
// The accompanying test expects a symlink to produce the same identifier as its target,
// so utils.UnixStat presumably follows symlinks (verify against its implementation).
func buildSymbolFileIdentifier(path string) (symbolFileIdentifier, error) {
199+
stat, err := utils.UnixStat(path)
200+
if err != nil {
201+
return symbolFileIdentifier{}, fmt.Errorf("error getting file info: %w", err)
202+
}
203+
204+
// NOTE(review): stat.Ino is converted to int here; if its type is wider than int on
// some target platform the value could be truncated. Confirm the supported platforms.
return symbolFileIdentifier{inode: int(stat.Ino), fileSize: stat.Size}, nil
205+
}
206+
189207
func (ctx *systemContext) getCudaSymbols(path string) (*symbolsEntry, error) {
190-
if data, ok := ctx.cudaSymbols[path]; ok {
208+
fileIdent, err := buildSymbolFileIdentifier(path)
209+
if err != nil {
210+
// an error means we cannot access the file, so returning makes sense as we will fail later anyway
211+
return nil, fmt.Errorf("error building symbol file identifier: %w", err)
212+
}
213+
214+
if data, ok := ctx.cudaSymbols[fileIdent]; ok {
191215
data.updateLastUsedTime()
192216
return data, nil
193217
}
@@ -210,7 +234,7 @@ func (ctx *systemContext) getCudaSymbols(path string) (*symbolsEntry, error) {
210234

211235
wrapper := &symbolsEntry{Symbols: data}
212236
wrapper.updateLastUsedTime()
213-
ctx.cudaSymbols[path] = wrapper
237+
ctx.cudaSymbols[fileIdent] = wrapper
214238

215239
ctx.telemetry.symbolCacheSize.Set(float64(len(ctx.cudaSymbols)))
216240

pkg/gpu/context_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
package gpu
99

1010
import (
11+
"os"
1112
"strconv"
1213
"strings"
1314
"testing"
@@ -204,3 +205,50 @@ func TestGetCurrentActiveGpuDevice(t *testing.T) {
204205
})
205206
}
206207
}
208+
209+
// TestBuildSymbolFileIdentifier checks that buildSymbolFileIdentifier returns the same
// identifier for a file and for a symlink pointing to it, and different identifiers for
// a separate copy with the same content and for a file with different content.
func TestBuildSymbolFileIdentifier(t *testing.T) {
210+
// Create a file, a symlink to it, a copy of it and an unrelated file,
211+
// and check that the identifier is the same only
212+
// for the original file and its symlink.
213+
dir := t.TempDir()
214+
filePath := dir + "/file"
215+
copyPath := dir + "/copy"
216+
differentPath := dir + "/different"
217+
symlinkPath := dir + "/symlink"
218+
219+
data := []byte("hello")
220+
// create the original file
221+
err := os.WriteFile(filePath, data, 0644)
222+
require.NoError(t, err)
223+
224+
// create a symlink to the original file, which should have the same identifier
225+
err = os.Symlink(filePath, symlinkPath)
226+
require.NoError(t, err)
227+
228+
// a copy is a different inode, so it should have a different identifier
229+
// even with the same size
230+
err = os.WriteFile(copyPath, data, 0644)
231+
require.NoError(t, err)
232+
233+
// a different file with different content should have a different identifier
234+
// as it's different content and different inode
235+
err = os.WriteFile(differentPath, []byte("different"), 0644)
236+
require.NoError(t, err)
237+
238+
origIdentifier, err := buildSymbolFileIdentifier(filePath)
239+
require.NoError(t, err)
240+
241+
symlinkIdentifier, err := buildSymbolFileIdentifier(symlinkPath)
242+
require.NoError(t, err)
243+
244+
copyIdentifier, err := buildSymbolFileIdentifier(copyPath)
245+
require.NoError(t, err)
246+
247+
differentIdentifier, err := buildSymbolFileIdentifier(differentPath)
248+
require.NoError(t, err)
249+
250+
// only the symlink shares the original's identifier; every other pair must differ
require.Equal(t, origIdentifier, symlinkIdentifier)
251+
require.NotEqual(t, origIdentifier, copyIdentifier)
252+
require.NotEqual(t, origIdentifier, differentIdentifier)
253+
require.NotEqual(t, copyIdentifier, differentIdentifier)
254+
}

pkg/gpu/stream_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,10 @@ func TestKernelLaunchesIncludeEnrichedKernelData(t *testing.T) {
350350
ConstantMem: constantMem,
351351
})
352352

353-
sysCtx.cudaSymbols[procBinPath] = &symbolsEntry{
353+
procBinIdent, err := buildSymbolFileIdentifier(procBinPath)
354+
require.NoError(t, err)
355+
356+
sysCtx.cudaSymbols[procBinIdent] = &symbolsEntry{
354357
Symbols: &cuda.Symbols{
355358
SymbolTable: map[uint64]string{kernAddress: kernName},
356359
Fatbin: fatbin,

0 commit comments

Comments
 (0)