@@ -20,6 +20,7 @@ import (
20
20
workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
21
21
"github.com/DataDog/datadog-agent/pkg/errors"
22
22
"github.com/DataDog/datadog-agent/pkg/gpu/cuda"
23
+ "github.com/DataDog/datadog-agent/pkg/security/utils"
23
24
"github.com/DataDog/datadog-agent/pkg/util/ktime"
24
25
"github.com/DataDog/datadog-agent/pkg/util/log"
25
26
)
@@ -41,7 +42,7 @@ type systemContext struct {
41
42
smVersionSet map [uint32 ]struct {}
42
43
43
44
// cudaSymbols maps each symbol file, identified by its inode and file size, to its Fatbin file data
44
- cudaSymbols map [string ]* symbolsEntry
45
+ cudaSymbols map [symbolFileIdentifier ]* symbolsEntry
45
46
46
47
// pidMaps maps each process ID to its memory maps
47
48
pidMaps map [int ][]* procfs.ProcMap
@@ -77,6 +78,14 @@ type systemContext struct {
77
78
fatbinTelemetry * fatbinTelemetry
78
79
}
79
80
81
+ // symbolFileIdentifier identifies a symbol file by its inode number and file size. It is used as the key
82
+ // of the cudaSymbols cache to avoid parsing the same file multiple times when it has different paths
83
+ // (e.g., symlinks in /proc/PID/root). fileSize is part of the identifier to mitigate possible issues with inode reuse.
// NOTE(review): inode numbers are generally only unique within a single filesystem; confirm that
// leaving the device ID out of this identifier is acceptable for files reached through /proc/PID/root.
84
+ type symbolFileIdentifier struct {
85
+ inode int // inode number, set from stat.Ino by buildSymbolFileIdentifier
86
+ fileSize int64 // file size, set from stat.Size by buildSymbolFileIdentifier
87
+ }
88
+
80
89
// contextTelemetry holds telemetry elements for the context
81
90
type contextTelemetry struct {
82
91
symbolCacheSize telemetry.Gauge
@@ -106,7 +115,7 @@ func getSystemContext(nvmlLib nvml.Interface, procRoot string, wmeta workloadmet
106
115
ctx := & systemContext {
107
116
deviceSmVersions : make (map [int ]int ),
108
117
smVersionSet : make (map [uint32 ]struct {}),
109
- cudaSymbols : make (map [string ]* symbolsEntry ),
118
+ cudaSymbols : make (map [symbolFileIdentifier ]* symbolsEntry ),
110
119
pidMaps : make (map [int ][]* procfs.ProcMap ),
111
120
nvmlLib : nvmlLib ,
112
121
procRoot : procRoot ,
@@ -186,8 +195,23 @@ func (ctx *systemContext) fillDeviceInfo() error {
186
195
return nil
187
196
}
188
197
198
// buildSymbolFileIdentifier stats the file at path and returns a symbolFileIdentifier built from
// the inode number and size reported for it. If the stat call fails, it returns the zero-value
// identifier together with the wrapped error.
+ func buildSymbolFileIdentifier (path string ) (symbolFileIdentifier , error ) {
199
+ stat , err := utils .UnixStat (path )
200
+ if err != nil {
201
+ return symbolFileIdentifier {}, fmt .Errorf ("error getting file info: %w" , err )
202
+ }
203
+
204
// NOTE(review): int(stat.Ino) presumably narrows an unsigned 64-bit inode number; confirm this
// cannot truncate (and cause identifier collisions) on platforms where int is 32 bits.
+ return symbolFileIdentifier {inode : int (stat .Ino ), fileSize : stat .Size }, nil
205
+ }
206
+
189
207
func (ctx * systemContext ) getCudaSymbols (path string ) (* symbolsEntry , error ) {
190
- if data , ok := ctx .cudaSymbols [path ]; ok {
208
+ fileIdent , err := buildSymbolFileIdentifier (path )
209
+ if err != nil {
210
+ // an error means we cannot access the file, so returning makes sense as we will fail later anyways
211
+ return nil , fmt .Errorf ("error building symbol file identifier: %w" , err )
212
+ }
213
+
214
+ if data , ok := ctx .cudaSymbols [fileIdent ]; ok {
191
215
data .updateLastUsedTime ()
192
216
return data , nil
193
217
}
@@ -210,7 +234,7 @@ func (ctx *systemContext) getCudaSymbols(path string) (*symbolsEntry, error) {
210
234
211
235
wrapper := & symbolsEntry {Symbols : data }
212
236
wrapper .updateLastUsedTime ()
213
- ctx .cudaSymbols [path ] = wrapper
237
+ ctx .cudaSymbols [fileIdent ] = wrapper
214
238
215
239
ctx .telemetry .symbolCacheSize .Set (float64 (len (ctx .cudaSymbols )))
216
240
0 commit comments