Skip to content

Commit d63a0ad

Browse files
authored
[dyninst] Expose more dyninst debug data via flares (#47953)
Co-authored-by: piotr.bejda <piotr.bejda@datadoghq.com>
1 parent 0af0a9f commit d63a0ad

File tree

10 files changed

+749
-0
lines changed

10 files changed

+749
-0
lines changed

pkg/dyninst/actuator/actuator.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,22 @@ func (a *Actuator) Stats() map[string]any {
9999
}
100100
}
101101

102+
// DebugInfo returns a snapshot of the actuator's internal state for debugging.
103+
func (a *Actuator) DebugInfo() *DebugInfo {
104+
debugInfoChan := make(chan DebugInfo, 1)
105+
select {
106+
case <-a.shuttingDown:
107+
return nil
108+
case a.events <- eventGetDebugInfo{debugInfoChan: debugInfoChan}:
109+
select {
110+
case <-a.shuttingDown:
111+
return nil
112+
case info := <-debugInfoChan:
113+
return &info
114+
}
115+
}
116+
}
117+
102118
// NewActuator creates a new Actuator instance.
103119
func NewActuator(cfg Config) *Actuator {
104120
if cfg.DiscoveredTypesLimit == 0 {

pkg/dyninst/actuator/debug_info.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
//go:build linux_bpf
7+
8+
package actuator
9+
10+
import (
11+
"slices"
12+
13+
"github.com/DataDog/datadog-agent/pkg/dyninst/ir"
14+
)
15+
16+
// DebugInfo contains a snapshot of the actuator's internal state for
17+
// debugging purposes, exposed via the flare mechanism.
18+
type DebugInfo struct {
19+
Processes []ProcessDebugInfo `json:"processes"`
20+
Programs []ProgramDebugInfo `json:"programs"`
21+
DiscoveredTypes map[string][]string `json:"discovered_types"`
22+
CurrentlyLoading *ir.ProgramID `json:"currently_loading"`
23+
QueuedLoading []ir.ProgramID `json:"queued_loading"`
24+
Counters CountersDebugInfo `json:"counters"`
25+
CircuitBreaker CircuitBreakerInfo `json:"circuit_breaker"`
26+
}
27+
28+
// ProcessDebugInfo contains debug information about a single process in the
29+
// actuator's state machine.
30+
type ProcessDebugInfo struct {
31+
PID int32 `json:"pid"`
32+
State string `json:"state"`
33+
Service string `json:"service"`
34+
CurrentProgram ir.ProgramID `json:"current_program"`
35+
ProbeCount int `json:"probe_count"`
36+
}
37+
38+
// ProgramDebugInfo contains debug information about a single program in the
39+
// actuator's state machine.
40+
type ProgramDebugInfo struct {
41+
ProgramID ir.ProgramID `json:"program_id"`
42+
State string `json:"state"`
43+
ProcessPID int32 `json:"process_pid"`
44+
ProbeCount int `json:"probe_count"`
45+
NeedsRecompilation bool `json:"needs_recompilation"`
46+
}
47+
48+
// CountersDebugInfo carries the actuator's cumulative counters. Each field
// is copied from the state machine counter of the same name.
type CountersDebugInfo struct {
	Loaded     uint64 `json:"loaded"`
	LoadFailed uint64 `json:"load_failed"`

	Attached     uint64 `json:"attached"`
	AttachFailed uint64 `json:"attach_failed"`

	Detached uint64 `json:"detached"`
	Unloaded uint64 `json:"unloaded"`

	TypeRecompilationsTriggered uint64 `json:"type_recompilations_triggered"`
}
58+
59+
// CircuitBreakerInfo reports the circuit breaker configuration.
type CircuitBreakerInfo struct {
	// Interval is the configured interval, rendered with its String method.
	Interval string `json:"interval"`

	// PerProbeCPULimit is the configured per-probe CPU limit.
	PerProbeCPULimit float64 `json:"per_probe_cpu_limit"`

	// AllProbesCPULimit is the configured CPU limit across all probes.
	AllProbesCPULimit float64 `json:"all_probes_cpu_limit"`

	// InterruptOverhead is the configured interrupt overhead, rendered with
	// its String method.
	InterruptOverhead string `json:"interrupt_overhead"`
}
66+
67+
// debugInfo returns a snapshot of the state machine for debugging.
68+
func (s *state) debugInfo() DebugInfo {
69+
processes := make([]ProcessDebugInfo, 0, len(s.processes))
70+
for _, p := range s.processes {
71+
processes = append(processes, ProcessDebugInfo{
72+
PID: p.processID.PID,
73+
State: p.state.String(),
74+
Service: p.service,
75+
CurrentProgram: p.currentProgram,
76+
ProbeCount: len(p.probes),
77+
})
78+
}
79+
80+
programs := make([]ProgramDebugInfo, 0, len(s.programs))
81+
for _, p := range s.programs {
82+
programs = append(programs, ProgramDebugInfo{
83+
ProgramID: p.id,
84+
State: p.state.String(),
85+
ProcessPID: p.processID.PID,
86+
ProbeCount: len(p.config),
87+
NeedsRecompilation: p.needsRecompilation,
88+
})
89+
}
90+
91+
discoveredTypes := make(map[string][]string, len(s.discoveredTypes))
92+
for svc, types := range s.discoveredTypes {
93+
discoveredTypes[svc] = slices.Clone(types)
94+
}
95+
96+
var currentlyLoading *ir.ProgramID
97+
if s.currentlyLoading != nil {
98+
id := s.currentlyLoading.id
99+
currentlyLoading = &id
100+
}
101+
102+
queuedLoading := make([]ir.ProgramID, 0, s.queuedLoading.len())
103+
for _, item := range s.queuedLoading.m {
104+
queuedLoading = append(queuedLoading, item.value.id)
105+
}
106+
107+
return DebugInfo{
108+
Processes: processes,
109+
Programs: programs,
110+
DiscoveredTypes: discoveredTypes,
111+
CurrentlyLoading: currentlyLoading,
112+
QueuedLoading: queuedLoading,
113+
Counters: CountersDebugInfo{
114+
Loaded: s.counters.loaded,
115+
LoadFailed: s.counters.loadFailed,
116+
Attached: s.counters.attached,
117+
AttachFailed: s.counters.attachFailed,
118+
Detached: s.counters.detached,
119+
Unloaded: s.counters.unloaded,
120+
TypeRecompilationsTriggered: s.counters.typeRecompilationsTriggered,
121+
},
122+
CircuitBreaker: CircuitBreakerInfo{
123+
Interval: s.breakerCfg.Interval.String(),
124+
PerProbeCPULimit: s.breakerCfg.PerProbeCPULimit,
125+
AllProbesCPULimit: s.breakerCfg.AllProbesCPULimit,
126+
InterruptOverhead: s.breakerCfg.InterruptOverhead.String(),
127+
},
128+
}
129+
}
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
//go:build linux_bpf
7+
8+
package actuator
9+
10+
import (
11+
"testing"
12+
13+
"github.com/stretchr/testify/assert"
14+
15+
"github.com/DataDog/datadog-agent/pkg/dyninst/ir"
16+
procinfo "github.com/DataDog/datadog-agent/pkg/dyninst/process"
17+
"github.com/DataDog/datadog-agent/pkg/dyninst/rcjson"
18+
)
19+
20+
func TestStateDebugInfoEmpty(t *testing.T) {
21+
s := newState(Config{DiscoveredTypesLimit: 512})
22+
info := s.debugInfo()
23+
24+
assert.Empty(t, info.Processes)
25+
assert.Empty(t, info.Programs)
26+
assert.Empty(t, info.DiscoveredTypes)
27+
assert.Nil(t, info.CurrentlyLoading)
28+
assert.Empty(t, info.QueuedLoading)
29+
assert.Equal(t, uint64(0), info.Counters.Loaded)
30+
}
31+
32+
func TestStateDebugInfoWithProcessesAndPrograms(t *testing.T) {
33+
s := newState(Config{DiscoveredTypesLimit: 512})
34+
35+
pid := procinfo.ID{PID: 42}
36+
probe := &rcjson.SnapshotProbe{
37+
LogProbeCommon: rcjson.LogProbeCommon{
38+
ProbeCommon: rcjson.ProbeCommon{
39+
ID: "probe-1",
40+
Version: 1,
41+
Where: &rcjson.Where{MethodName: "main"},
42+
},
43+
},
44+
}
45+
s.processes[pid] = &process{
46+
processID: pid,
47+
state: processStateAttached,
48+
service: "my-service",
49+
probes: map[probeKey]ir.ProbeDefinition{{id: "probe-1", version: 1}: probe},
50+
}
51+
52+
progID := ir.ProgramID(1)
53+
s.programs[progID] = &program{
54+
id: progID,
55+
state: programStateLoaded,
56+
processID: pid,
57+
config: []ir.ProbeDefinition{probe},
58+
}
59+
s.processes[pid].currentProgram = progID
60+
61+
s.discoveredTypes["my-service"] = []string{"MyType", "OtherType"}
62+
63+
info := s.debugInfo()
64+
65+
assert.Len(t, info.Processes, 1)
66+
assert.Equal(t, int32(42), info.Processes[0].PID)
67+
assert.Equal(t, "Attached", info.Processes[0].State)
68+
assert.Equal(t, "my-service", info.Processes[0].Service)
69+
assert.Equal(t, 1, info.Processes[0].ProbeCount)
70+
assert.Equal(t, progID, info.Processes[0].CurrentProgram)
71+
72+
assert.Len(t, info.Programs, 1)
73+
assert.Equal(t, progID, info.Programs[0].ProgramID)
74+
assert.Equal(t, "Loaded", info.Programs[0].State)
75+
assert.Equal(t, int32(42), info.Programs[0].ProcessPID)
76+
assert.Equal(t, 1, info.Programs[0].ProbeCount)
77+
assert.False(t, info.Programs[0].NeedsRecompilation)
78+
79+
assert.Equal(t, []string{"MyType", "OtherType"}, info.DiscoveredTypes["my-service"])
80+
assert.Nil(t, info.CurrentlyLoading)
81+
assert.Empty(t, info.QueuedLoading)
82+
}
83+
84+
func TestStateDebugInfoCounters(t *testing.T) {
85+
s := newState(Config{DiscoveredTypesLimit: 512})
86+
s.counters.loaded = 5
87+
s.counters.loadFailed = 2
88+
s.counters.attached = 3
89+
s.counters.typeRecompilationsTriggered = 1
90+
91+
info := s.debugInfo()
92+
93+
assert.Equal(t, uint64(5), info.Counters.Loaded)
94+
assert.Equal(t, uint64(2), info.Counters.LoadFailed)
95+
assert.Equal(t, uint64(3), info.Counters.Attached)
96+
assert.Equal(t, uint64(1), info.Counters.TypeRecompilationsTriggered)
97+
}
98+
99+
func TestStateDebugInfoQueuedPrograms(t *testing.T) {
100+
s := newState(Config{DiscoveredTypesLimit: 512})
101+
102+
pid := procinfo.ID{PID: 10}
103+
prog := &program{
104+
id: ir.ProgramID(7),
105+
state: programStateQueued,
106+
processID: pid,
107+
}
108+
s.programs[prog.id] = prog
109+
s.queuedLoading.pushBack(prog)
110+
111+
info := s.debugInfo()
112+
113+
assert.Len(t, info.QueuedLoading, 1)
114+
assert.Equal(t, ir.ProgramID(7), info.QueuedLoading[0])
115+
assert.Nil(t, info.CurrentlyLoading)
116+
}
117+
118+
func TestStateDebugInfoCurrentlyLoading(t *testing.T) {
119+
s := newState(Config{DiscoveredTypesLimit: 512})
120+
s.currentlyLoading = &program{id: ir.ProgramID(3)}
121+
122+
info := s.debugInfo()
123+
124+
assert.NotNil(t, info.CurrentlyLoading)
125+
assert.Equal(t, ir.ProgramID(3), *info.CurrentlyLoading)
126+
}

pkg/dyninst/actuator/state.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,10 @@ func handleEvent(
382382
ev.metricsChan <- sm.Metrics()
383383
return nil
384384

385+
case eventGetDebugInfo:
386+
ev.debugInfoChan <- sm.debugInfo()
387+
return nil
388+
385389
case eventHeartbeatCheck:
386390
handleHeartbeatCheck(sm, effects)
387391

pkg/dyninst/actuator/state_events.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,15 @@ func (e eventGetMetrics) String() string {
136136
return "eventGetMetrics{}"
137137
}
138138

139+
type eventGetDebugInfo struct {
140+
baseEvent
141+
debugInfoChan chan<- DebugInfo
142+
}
143+
144+
func (e eventGetDebugInfo) String() string {
145+
return "eventGetDebugInfo{}"
146+
}
147+
139148
type eventHeartbeatCheck struct {
140149
baseEvent
141150
}

0 commit comments

Comments
 (0)