Skip to content

Commit 4fed36d

Browse files
authored
[WP] Refactor custom events to properly send them (#47586)
### What does this PR do? - Refactor how custom events are handled so that one is always sent each time an action report is sent. - Add a new status to the kill action to handle processes that exit early. When we try to kill a single process (scope `process`), if the process exits before we kill it, we abort the kill and report `kill_aborted` instead of sending the `kill_queued` status. Note: This PR only fixes the issue for the `process` scope. Other changes will be needed in the future to handle other edge cases. ### Motivation Now, every time an action report is sent, an associated remediation status event is sent as well. Previously, there were situations where one was emitted but not the other. Co-authored-by: theo.putegnat <theo.putegnat@datadoghq.com>
1 parent 9ebd458 commit 4fed36d

File tree

12 files changed

+527
-72
lines changed

12 files changed

+527
-72
lines changed

pkg/security/module/server.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,15 @@ func (a *APIServer) updateCustomEventTags(msg *api.SecurityEventMessage) {
328328
}
329329
}
330330

331+
// SendCustomEventKillAction sends a custom remediation event for each resolved kill action report
332+
func SendCustomEventKillAction(probe *sprobe.Probe, tags []string, actionReports []model.ActionReport) {
333+
for _, report := range actionReports {
334+
if _, ok := report.(*sprobe.KillActionReport); ok {
335+
probe.SendCustomEventKillAction(report, tags)
336+
}
337+
}
338+
}
339+
331340
func (a *APIServer) start(ctx context.Context) {
332341
ticker := time.NewTicker(200 * time.Millisecond)
333342
defer ticker.Stop()
@@ -359,6 +368,9 @@ func (a *APIServer) start(ctx context.Context) {
359368
return false
360369
}
361370

371+
// For kill actions, send a custom remediation event per resolved kill report
372+
// If a rule contains multiple kill actions, a custom event will be sent for each of them
373+
SendCustomEventKillAction(a.probe, msg.tags, msg.actionReports)
362374
if a.containerFilter != nil {
363375
containerName, imageName, podNamespace := utils.GetContainerFilterTags(msg.tags)
364376
if a.containerFilter.IsExcluded(nil, containerName, imageName, podNamespace) {

pkg/security/probe/actions.go

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,23 @@ const (
3333
// KillActionStatusQueued indicates the kill action was queued until the end of the first rule period
3434
KillActionStatusQueued KillActionStatus = "kill_queued"
3535
// KillActionStatusPartiallyPerformed indicates the kill action was performed on some processes but not all
36-
KillActionStatusPartiallyPerformed = "partially_performed"
36+
KillActionStatusPartiallyPerformed KillActionStatus = "partially_performed"
37+
// KillActionStatusKillAborted indicates the kill action was aborted because the process exited before the kill was performed
38+
KillActionStatusKillAborted KillActionStatus = "kill_aborted"
3739

3840
// maxRetryForMsgWithKillAction is the maximum number of retries for a kill action
3941
// - a kill can be queued up to the end of the first disarmer period (1min by default)
4042
// - so, we set the server retry period to 1min and 2sec (+2sec to have the time to trigger the kill and wait to catch the process exit)
4143
maxRetryForMsgWithKillAction = 62
4244
)
4345

46+
// RemediationContainerContext represents the container context for remediation events (e.g. container ID and created_at).
47+
// Defined here so KillActionReport can use it on all platforms; Linux-specific remediation logic lives in remediations_linux.go.
48+
type RemediationContainerContext struct {
49+
CreatedAt uint64 `json:"created_at,omitempty"`
50+
ID string `json:"id,omitempty"`
51+
}
52+
4453
// KillActionReport defines a kill action report
4554
type KillActionReport struct {
4655
sync.RWMutex
@@ -55,9 +64,10 @@ type KillActionReport struct {
5564
DisarmerType string
5665

5766
// internal
58-
Pid uint32
59-
resolved bool
60-
rule *rules.Rule
67+
Pid uint32
68+
resolved bool
69+
rule *rules.Rule
70+
containerContext RemediationContainerContext // This is an internal field needed for remediation status events
6171
}
6272

6373
// JKillActionReport used to serialize date
@@ -128,3 +138,10 @@ func (k *KillActionReport) IsMatchingRule(ruleID eval.RuleID) bool {
128138

129139
return k.rule.ID == ruleID
130140
}
141+
142+
// GetRemediationContainerContext returns the container ID and created_at when the process was in a container, otherwise empty/zero.
143+
func (k *KillActionReport) GetRemediationContainerContext() RemediationContainerContext {
144+
k.RLock()
145+
defer k.RUnlock()
146+
return k.containerContext
147+
}

pkg/security/probe/probe.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ type PlatformProbe interface {
5959
GetEventTags(_ containerutils.ContainerID) []string
6060
EnableEnforcement(bool)
6161
ReplayEvents()
62+
SendCustomEventKillAction(_ model.ActionReport, _ []string)
6263
}
6364

6465
var probeTelemetry = struct {
@@ -357,6 +358,11 @@ func (p *Probe) AddDiscarderPushedCallback(cb DiscarderPushedCallback) {
357358
p.PlatformProbe.AddDiscarderPushedCallback(cb)
358359
}
359360

361+
// SendCustomEventKillAction sends a custom remediation-style event for a resolved kill action report.
362+
func (p *Probe) SendCustomEventKillAction(report model.ActionReport, tags []string) {
363+
p.PlatformProbe.SendCustomEventKillAction(report, tags)
364+
}
365+
360366
// DispatchCustomEvent sends a custom event to the probe event handler
361367
func (p *Probe) DispatchCustomEvent(rule *rules.Rule, event *events.CustomEvent) {
362368
p.logTraceEvent(event.GetEventType(), event)

pkg/security/probe/probe_ebpf.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3366,7 +3366,7 @@ func (p *EBPFProbe) HandleActions(ctx *eval.Context, rule *rules.Rule) {
33663366
p.probe.onRuleActionPerformed(rule, action.Def)
33673367
}
33683368
if report != nil {
3369-
p.HandleKillRemediation(rule, ev, report, action)
3369+
p.HandleKillRemediation(rule, ev, action)
33703370
}
33713371

33723372
case action.Def.CoreDump != nil:

pkg/security/probe/probe_ebpfless.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,9 @@ func (p *EBPFLessProbe) EnableEnforcement(state bool) {
720720
p.processKiller.SetState(state)
721721
}
722722

723+
// SendCustomEventKillAction is a no-op for EBPFLess (remediation custom events use EBPF probe).
724+
func (p *EBPFLessProbe) SendCustomEventKillAction(_ model.ActionReport, _ []string) {}
725+
723726
// GetAgentContainerContext returns the agent container context
724727
func (p *EBPFLessProbe) GetAgentContainerContext() *events.AgentContainerContext {
725728
return p.probe.GetAgentContainerContext()

pkg/security/probe/probe_others.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,6 @@ func (p *Probe) GetAgentContainerContext() *events.AgentContainerContext {
133133
// Walk iterates through the entire tree and call the provided callback on each entry
134134
func (p *Probe) Walk(_ func(*model.ProcessCacheEntry)) {
135135
}
136+
137+
// SendCustomEventKillAction is a no-op on unsupported platforms (remediation custom events are Linux-only).
138+
func (p *Probe) SendCustomEventKillAction(_ model.ActionReport, _ []string) {}

pkg/security/probe/probe_windows.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1593,6 +1593,9 @@ func (p *WindowsProbe) EnableEnforcement(state bool) {
15931593
p.processKiller.SetState(state)
15941594
}
15951595

1596+
// SendCustomEventKillAction is a no-op on Windows (remediation custom events are Linux-only).
1597+
func (p *WindowsProbe) SendCustomEventKillAction(_ model.ActionReport, _ []string) {}
1598+
15961599
// NewProbe instantiates a new runtime security agent probe
15971600
func NewProbe(config *config.Config, hostname string, opts Opts) (*Probe, error) {
15981601
opts.normalize()

pkg/security/probe/process_killer.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,12 @@ func (p *ProcessKiller) HandleProcessExited(event *model.Event) {
241241
defer report.Unlock()
242242

243243
if report.Pid == event.ProcessContext.Pid {
244+
if report.Scope == "process" {
245+
if report.Status == KillActionStatusQueued {
246+
// The process exited before the kill was performed
247+
report.Status = KillActionStatusKillAborted
248+
}
249+
}
244250
report.ExitedAt = event.ProcessContext.ExitTime
245251
report.resolved = true
246252
return true
@@ -324,6 +330,10 @@ func (p *ProcessKiller) KillAndReport(kill *rules.KillDefinition, rule *rules.Ru
324330
Pid: ev.ProcessContext.Pid,
325331
rule: rule,
326332
}
333+
if !ev.ProcessContext.Process.ContainerContext.IsNull() {
334+
report.containerContext.ID = string(ev.ProcessContext.Process.ContainerContext.ContainerID)
335+
report.containerContext.CreatedAt = ev.ProcessContext.Process.ContainerContext.CreatedAt
336+
}
327337
if dismantled {
328338
report.Status = KillActionStatusRuleDismantled
329339
seclog.Warnf("skipping kill action of rule `%s` because it has been dismantled", rule.ID)
@@ -423,6 +433,10 @@ func (p *ProcessKiller) KillAndReport(kill *rules.KillDefinition, rule *rules.Ru
423433
Pid: ev.ProcessContext.Pid,
424434
rule: rule,
425435
}
436+
if !ev.ProcessContext.Process.ContainerContext.IsNull() {
437+
report.containerContext.ID = string(ev.ProcessContext.Process.ContainerContext.ContainerID)
438+
report.containerContext.CreatedAt = ev.ProcessContext.Process.ContainerContext.CreatedAt
439+
}
426440

427441
if disarmer != nil && p.warmupEnqueued(disarmer, sig, pcs) {
428442
log.Warnf("rule %s triggered on first period, putting pids to kill on wait list", rule.ID)

pkg/security/probe/remediations_linux.go

Lines changed: 54 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,6 @@ type RemediationProcessContext struct {
6969
PID uint32 `json:"pid,omitempty"`
7070
}
7171

72-
// RemediationContainerContext represents the container context for remediation events
73-
// easyjson:json
74-
type RemediationContainerContext struct {
75-
CreatedAt uint64 `json:"created_at,omitempty"`
76-
ID string `json:"id,omitempty"`
77-
}
78-
7972
// RemediationAgentContext represents the agent context for remediation events
8073
// easyjson:json
8174
type RemediationAgentContext struct {
@@ -109,15 +102,6 @@ func (k RemediationEvent) ToJSON() ([]byte, error) {
109102
return utils.MarshalEasyJSON(k)
110103
}
111104

112-
func getAgentEventID(rule *rules.Rule) string {
113-
for _, tag := range rule.Tags {
114-
if strings.HasPrefix(tag, "agent_event_id:") {
115-
return tag[len("agent_event_id:"):]
116-
}
117-
}
118-
return ""
119-
}
120-
121105
func getRemediationTagBool(rule *rules.Rule) bool {
122106
for _, tag := range rule.Tags {
123107
if strings.HasPrefix(tag, "remediation_rule:") {
@@ -140,27 +124,30 @@ func generateNetworkIsolationActionKey(ruleID string, filter string) string {
140124
return NetworkIsolationKeyPrefix + ruleID + hex.EncodeToString(hash[:])
141125

142126
}
143-
func generateRemediationActionKey(rule *rules.Rule) string {
127+
func generateRemediationActionKey(key string) string {
144128
// prefix + action key
145-
return RemediationKeyPrefix + getAgentEventID(rule)
129+
return RemediationKeyPrefix + key
146130
}
147131

148132
func getRemediationKeyFromAction(rule *rules.Rule, action *rules.Action) string {
149-
if getRemediationTagBool(rule) {
150-
// Should not have multiple remediation actions for the same rule that were generated by customers (remediation)
151-
return generateRemediationActionKey(rule)
152-
}
133+
key := ""
134+
153135
// Having multiple actions in the same rule means that they are from a rule that was not dynamically generated for the remediation feature
154136
// We assume this combination unique
155137
if action.Def.Kill != nil {
156138
// ruleID + scope + signal
157-
return generateKillActionKey(rule.ID, action.Def.Kill.Scope, action.Def.Kill.Signal)
158-
}
159-
if action.Def.NetworkFilter != nil {
139+
key = generateKillActionKey(rule.ID, action.Def.Kill.Scope, action.Def.Kill.Signal)
140+
} else if action.Def.NetworkFilter != nil {
160141
// ruleID + bpffilter
161-
return generateNetworkIsolationActionKey(rule.ID, action.Def.NetworkFilter.BPFFilter)
142+
key = generateNetworkIsolationActionKey(rule.ID, action.Def.NetworkFilter.BPFFilter)
143+
}
144+
145+
if getRemediationTagBool(rule) {
146+
// Should not have multiple remediation actions for the same rule that were generated by customers (remediation)
147+
return generateRemediationActionKey(key)
162148
}
163-
return ""
149+
150+
return key
164151
}
165152

166153
// NewRemediationEvent creates a new Remediation event from the latest action report
@@ -211,6 +198,17 @@ func getTagsFromRule(rule *rules.Rule) RuleTags {
211198
return ruleTags
212199
}
213200

201+
// tagsToRuleTags converts a slice of "key:value" tags into RuleTags map.
202+
func tagsToRuleTags(tags []string) RuleTags {
203+
ruleTags := make(RuleTags)
204+
for _, tag := range tags {
205+
if before, after, ok := strings.Cut(tag, ":"); ok {
206+
ruleTags[before] = after
207+
}
208+
}
209+
return ruleTags
210+
}
211+
214212
// HandleRemediationStatus is called when a new ruleset is loaded
215213
// It cleans up the activeRemediations map from the kill actions and network isolation actions that are not persistent
216214
func (p *EBPFProbe) HandleRemediationStatus(rs *rules.RuleSet) {
@@ -278,7 +276,33 @@ func (p *EBPFProbe) HandleRemediationStatus(rs *rules.RuleSet) {
278276
}
279277
}
280278

281-
func (p *EBPFProbe) HandleKillRemediation(rule *rules.Rule, ev *model.Event, report *KillActionReport, action *rules.Action) {
279+
// SendCustomEventKillAction sends a custom remediation event for a resolved kill action report
280+
func (p *EBPFProbe) SendCustomEventKillAction(report model.ActionReport, tags []string) {
281+
killReport, ok := report.(*KillActionReport)
282+
if !ok {
283+
return
284+
}
285+
killReport.RLock()
286+
status := string(killReport.Status)
287+
scope := killReport.Scope
288+
pid := killReport.Pid
289+
killReport.RUnlock()
290+
291+
containerContext := killReport.GetRemediationContainerContext()
292+
293+
remediation := &Remediation{
294+
actionType: RemediationTypeKill,
295+
triggered: true,
296+
scope: scope,
297+
containerContext: containerContext,
298+
processContext: RemediationProcessContext{PID: pid},
299+
ruleTags: tagsToRuleTags(tags),
300+
}
301+
re := NewRemediationEvent(p, remediation, status, RemediationTypeKillStr)
302+
p.SendRemediationEvent(re)
303+
}
304+
305+
func (p *EBPFProbe) HandleKillRemediation(rule *rules.Rule, ev *model.Event, action *rules.Action) {
282306
remediationKey := getRemediationKeyFromAction(rule, action)
283307
p.activeRemediationsLock.Lock()
284308
defer p.activeRemediationsLock.Unlock()
@@ -292,30 +316,9 @@ func (p *EBPFProbe) HandleKillRemediation(rule *rules.Rule, ev *model.Event, rep
292316
remediation.policy = ""
293317
remediation.ruleTags = getTagsFromRule(rule)
294318

295-
} else {
296-
// Don't create a new entry for kill actions that are not from the remediation feature
297-
// It will only be used to send an event
298-
remediation = &Remediation{
299-
actionType: RemediationTypeKill,
300-
triggered: true,
301-
processContext: RemediationProcessContext{
302-
PID: ev.ProcessContext.Process.Pid,
303-
},
304-
containerContext: RemediationContainerContext{
305-
ID: string(ev.ProcessContext.Process.ContainerContext.ContainerID),
306-
CreatedAt: ev.ProcessContext.Process.ContainerContext.CreatedAt,
307-
},
308-
scope: action.Def.Kill.Scope,
309-
}
310319
}
311-
312-
// Get kill status
313-
report.RLock()
314-
status := string(report.Status)
315-
report.RUnlock()
316-
// Send custom event
317-
killActionEvent := NewRemediationEvent(p, remediation, status, RemediationTypeKillStr)
318-
p.SendRemediationEvent(killActionEvent)
320+
// Don't send an event for kill action here
321+
// The event will be sent when the report is resolved to handle the cases where disarmers are used
319322
}
320323

321324
func (p *EBPFProbe) HandleNetworkRemediation(rule *rules.Rule, ev *model.Event, report *RawPacketActionReport, action *rules.Action) {

pkg/security/probe/remediations_linux_easyjson.go

Lines changed: 2 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)