Skip to content

Commit 6a440bc

Browse files
committed
feat!: default to sched_process_exec tracepoint on all architectures
Leverage the `sched/sched_process_exec` tracepoint to generate `execve` exit events for both successful `execve` and `execveat` system calls. For failing calls, use dedicated programs/fillers to generate `execve` and `execveat` exit events. This architectural choice is motivated by the fact that the kernel hasn't consistently called the correct tracepoint for `execve` and `execveat` calls on all architectures, and hasn't consistently identified the correct system call in the tracepoint context: - on `x86_64`, a successful `execveat` call is identified as `execve`, and a failing one is identified as `execveat` - on `aarch64`, till version 5.18 (actually, the fix was back-ported up to 5.15: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.15.y&id=42eede3ae05bbf32cb0d87940b466ec5a76aca3f), neither successful `execve`s nor successful `execveat`s used to trigger the `sys_exit` (see https://www.spinics.net/lists/linux-trace/msg01001.html) tracepoint; only failing ones have always triggered the correct behaviour - on `s390x`, each call correctly triggers the `sys_exit` tracepoint and is correctly identified as `execve` and `execveat` Indeed, the `sched/sched_process_exec` tracepoint is correctly triggered on all architectures for successful calls. Moreover, failing calls correctly trigger the `sys_exit` tracepoint, and are correctly associated with the right syscall number. In the past, this design was applied just for `aarch64`, but since it works consistently on all architectures, its application was extended. The only issue seems to be that now we generate `execve` exit events for both `execve` and `execveat`, on call success: this is not a big problem because, in the previous implementation, for successful `execveat` calls, we were only able to generate `execveat` exit events on `s390x`. For this latter case, users will not be impacted by the new design if they rely on rule conditions matching both `execve` and `execveat` events (e.g.: `... 
evt.type in (execve, execveat) ...`). This patch rearranges `execve` and `execveat` driver tests, by moving tests related to successful calls in `test/drivers/test_suites/generic_tracepoints_suite/sched_process_exec.cpp`, and keeping tests related to failing calls in `test/drivers/test_suites/syscall_exit_suite/execve_x.cpp` and `test/drivers/test_suites/syscall_exit_suite/execveat_x.cpp`. Old tests relate to new tests in the following way: - `execveX_success` -> `sched_proc_exec_execve` - `execveX_not_upperlayer` -> `sched_proc_exec_execve_not_upperlayer` - `execveX_upperlayer_success` -> `sched_proc_exec_execve_upperlayer` - `execveX_success_memfd` -> `sched_proc_exec_execve_memfd` - `execveX_symlink` -> `sched_proc_exec_execve_symlink` - `execveatX_correct_exit` -> `sched_proc_exec_execveat` - `execveatX_execve_exit` -> `sched_proc_exec_execveat` - `execveatX_execve_exit_comm_equal_to_fd` -> `sched_proc_exec_execveat_comm_equal_to_fd` - `execveatX_success_memfd` -> `sched_proc_exec_execveat_memfd` BREAKING CHANGE: emit `execve` exit event instead of `execveat` exit event in case of successful `execveat` call on `s390x` Signed-off-by: Leonardo Di Giovanna <leonardodigiovanna1@gmail.com>
1 parent e231639 commit 6a440bc

File tree

16 files changed

+1009
-1382
lines changed

16 files changed

+1009
-1382
lines changed

driver/bpf/fillers.h

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ FILLER_RAW(terminate_filler) {
219219
}
220220
break;
221221
case PPM_SKIP_EVENT:
222+
bpf_printk("PPM_SKIP_EVENT event=%d curarg=%d\n",
223+
state->tail_ctx.evt_type,
224+
state->tail_ctx.curarg);
222225
break;
223226
case PPM_FAILURE_FRAME_SCRATCH_MAP_FULL:
224227
bpf_printk("PPM_FAILURE_FRAME_SCRATCH_MAP_FULL event=%d curarg=%d\n",
@@ -2267,11 +2270,21 @@ FILLER(proc_startupdate, true) {
22672270
pid_t pid;
22682271
int res;
22692272

2273+
retval = bpf_syscall_get_retval(data->ctx);
2274+
22702275
/*
2271-
* Make sure the operation was successful
2276+
* For `execve` and `execveat`, the only purpose of this filler is to catch events in case of
2277+
* system call failure. In case of system call success, `execve` and `execveat` events are
2278+
* caught by our tracepoint on `sched/sched_process_exec` (see comment on
2279+
* the `sched_process_exec` probe in `driver/bpf/probe.c`). A successful `execve`/`execveat` call is
2280+
* identified by `retval == 0`.
22722281
*/
2282+
if(retval == 0 && (data->state->tail_ctx.evt_type == PPME_SYSCALL_EXECVE_19_X ||
2283+
data->state->tail_ctx.evt_type == PPME_SYSCALL_EXECVEAT_X)) {
2284+
return PPM_SKIP_EVENT;
2285+
}
2286+
22732287
/* Parameter 1: res (type: PT_ERRNO) */
2274-
retval = bpf_syscall_get_retval(data->ctx);
22752288
res = bpf_push_s64_to_ring(data, retval);
22762289
CHECK_RES(res);
22772290

@@ -6715,7 +6728,6 @@ FILLER(sys_getdents64_x, true) {
67156728
return bpf_push_s64_to_ring(data, fd);
67166729
}
67176730

6718-
#ifdef CAPTURE_SCHED_PROC_EXEC
67196731
/* We set `is_syscall` flag to `false` since this is not
67206732
* a real syscall, we only send the same event from another
67216733
* tracepoint.
@@ -7111,8 +7123,6 @@ FILLER(sched_prog_exec_5, false) {
71117123
return bpf_push_u32_to_ring(data, egid.val);
71127124
}
71137125

7114-
#endif
7115-
71167126
#ifdef CAPTURE_SCHED_PROC_FORK
71177127
/* These `sched_proc_fork` fillers will generate a
71187128
* `PPME_SYSCALL_CLONE_20_X` event.

driver/bpf/plumbing_helpers.h

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,6 @@ static __always_inline int bpf_test_bit(int nr, unsigned long *addr) {
548548
return 1UL & (_READ(addr[BIT_WORD(nr)]) >> (nr & (BITS_PER_LONG - 1)));
549549
}
550550

551-
#if defined(CAPTURE_SCHED_PROC_FORK) || defined(CAPTURE_SCHED_PROC_EXEC)
552551
static __always_inline bool bpf_drop_syscall_exit_events(void *ctx, ppm_event_code evt_type) {
553552
long ret = 0;
554553
switch(evt_type) {
@@ -569,23 +568,19 @@ static __always_inline bool bpf_drop_syscall_exit_events(void *ctx, ppm_event_co
569568
return ret == 0;
570569
#endif
571570

572-
/* If `CAPTURE_SCHED_PROC_EXEC` logic is enabled we collect execve-family
573-
* exit events through a dedicated tracepoint so we can ignore them here.
574-
*/
575-
#ifdef CAPTURE_SCHED_PROC_EXEC
576571
case PPME_SYSCALL_EXECVE_19_X:
577572
case PPME_SYSCALL_EXECVEAT_X:
573+
/* We collect execve-family successful exit events through a dedicated `sched_process_exec`
574+
* tracepoint , so we can ignore them here.
575+
*/
578576
ret = bpf_syscall_get_retval(ctx);
579-
/* We ignore only successful events, so ret == 0! */
580577
return ret == 0;
581-
#endif
582578

583579
default:
584580
break;
585581
}
586582
return false;
587583
}
588-
#endif
589584

590585
static __always_inline bool drop_event(void *ctx,
591586
struct scap_bpf_per_cpu_state *state,

driver/bpf/probe.c

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,8 @@ BPF_PROBE("raw_syscalls/", sys_exit, sys_exit_args) {
207207
}
208208
}
209209

210-
#if defined(CAPTURE_SCHED_PROC_FORK) || defined(CAPTURE_SCHED_PROC_EXEC)
211210
if(bpf_drop_syscall_exit_events(ctx, evt_type))
212211
return 0;
213-
#endif
214212

215213
call_filler(ctx, ctx, evt_type, drop_flags, socketcall_syscall_id);
216214
return 0;
@@ -289,7 +287,30 @@ __bpf_section(TP_NAME "sched/sched_process_fork&1") int bpf_sched_process_fork(
289287
}
290288
#endif
291289

292-
#ifdef CAPTURE_SCHED_PROC_EXEC
290+
/*
291+
* This tracepoint generates `execve` exit events for both successful `execve` and `execveat` system
292+
* calls. Events related to system call failures, for both system calls, are generated by
293+
* `proc_startupdate` filler. This architectural choice is motivated by the fact that the kernel
294+
* hasn't consistently called the correct tracepoint for `execve` and `execveat` calls on all
295+
* architectures:
296+
* - on `x86_64`, a successful `execveat` call is identified as `execve`, and a failing one is
297+
* identified as `execveat`
298+
* - on `aarch64`, till version 5.18 (actually, the fix was back-ported up to 5.15:
299+
* https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.15.y&id=42eede3ae05bbf32cb0d87940b466ec5a76aca3f),
300+
* neither successful `execve`s nor successful `execveat`s used to trigger the `sys_exit` (see
301+
* https://www.spinics.net/lists/linux-trace/msg01001.html) tracepoint; only failing ones have
302+
* always triggered the correct behaviour
303+
* - on `s390x`, each call correctly triggers `sys_exit` tracepoint and is correctly identified as
304+
* `execve` and `execveat`
305+
* Indeed, the `sched/sched_process_exec` tracepoint is correctly triggered on all architectures for successful
306+
* calls. Moreover, failing calls correctly trigger the `sys_exit` tracepoint, and are correctly
307+
* associated to the right syscall number. In the past, this design was applied just for `aarch64`,
308+
* but since it works consistently on all architectures, its application was extended. The only
309+
* issue seems to be that now we generate `execve` exit events for both `execve` and `execveat` if
310+
* the call is successful: this is not a big problem, because even with the previous implementation
311+
* we weren't able to generate any `execveat` exit event on either `x86_64` or `aarch64` for this
312+
* scenario.
313+
*/
293314
BPF_PROBE("sched/", sched_process_exec, sched_process_exec_args) {
294315
struct scap_bpf_settings *settings;
295316
/* We will always send an execve exit event. */
@@ -324,7 +345,6 @@ BPF_PROBE("sched/", sched_process_exec, sched_process_exec_args) {
324345
filler_code);
325346
return 0;
326347
}
327-
#endif /* CAPTURE_SCHED_PROC_EXEC */
328348

329349
#ifdef CAPTURE_SCHED_PROC_FORK
330350
__bpf_section("raw_tracepoint/sched_process_fork&2") int bpf_sched_process_fork(

driver/bpf/types.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,6 @@ struct sys_stash_args {
140140
};
141141
#endif
142142

143-
#ifdef CAPTURE_SCHED_PROC_EXEC
144-
145143
#ifndef BPF_SUPPORTS_RAW_TRACEPOINTS
146144
struct sched_process_exec_args {
147145
unsigned short common_type;
@@ -163,8 +161,6 @@ struct sched_process_exec_args {
163161
};
164162
#endif /* BPF_SUPPORTS_RAW_TRACEPOINTS */
165163

166-
#endif /* CAPTURE_SCHED_PROC_EXEC */
167-
168164
#ifdef CAPTURE_SCHED_PROC_FORK
169165
/* TP_PROTO(struct task_struct *parent, struct task_struct *child)
170166
* Taken from `/include/trace/events/sched.h`

driver/feature_gates.h

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -65,37 +65,6 @@ or GPL2.txt for full copies of the license.
6565
#define CAPTURE_SCHED_PROC_FORK
6666
#endif
6767

68-
///////////////////////////////
69-
// CAPTURE_SCHED_PROC_EXEC
70-
///////////////////////////////
71-
72-
/* In some architectures we are not able to catch the `execve exit event`
73-
* from the `sys_exit` tracepoint. This is because there is no
74-
* default behavior among different architectures... you can find more
75-
* info here:
76-
* https://www.spinics.net/lists/linux-trace/msg01001.html
77-
*
78-
* Anyway, to not lose this event, we need to instrument a new kernel tracepoint:
79-
*
80-
* - `sched_process_exec`: allows us to catch every process that correctly performs
81-
* an `execve` call.
82-
*
83-
* In this way we can send to userspace a `PPME_SYSCALL_EXECVE_X` event
84-
* as we do with the `sys_exit` tracepoint.
85-
*
86-
* All the architectures that need this patch can use our BPF probe with all
87-
* supported kernel versions (so >= `4.14`), since `BPF_PROG_TYPE_RAW_TRACEPOINT` are
88-
* not required in this case.
89-
*
90-
* If you run old kernels, you can use the kernel module which requires
91-
* kernel versions greater or equal than `3.4`, since this tracepoint has
92-
* been introduced in the following kernel release:
93-
* https://github.com/torvalds/linux/commit/4ff16c25e2cc48cbe6956e356c38a25ac063a64d
94-
*/
95-
#if defined(CONFIG_ARM64)
96-
#define CAPTURE_SCHED_PROC_EXEC
97-
#endif
98-
9968
///////////////////////////////
10069
// CAPTURE_64BIT_ARGS_SINGLE_REGISTER
10170
///////////////////////////////
@@ -142,14 +111,6 @@ or GPL2.txt for full copies of the license.
142111

143112
#elif defined(__USE_VMLINUX__) /* modern BPF probe */
144113

145-
///////////////////////////////
146-
// CAPTURE_SCHED_PROC_EXEC
147-
///////////////////////////////
148-
149-
#if defined(__TARGET_ARCH_arm64)
150-
#define CAPTURE_SCHED_PROC_EXEC
151-
#endif
152-
153114
///////////////////////////////
154115
// CAPTURE_SCHED_PROC_FORK
155116
///////////////////////////////
@@ -209,14 +170,6 @@ or GPL2.txt for full copies of the license.
209170
#define CAPTURE_SCHED_PROC_FORK
210171
#endif
211172

212-
///////////////////////////////
213-
// CAPTURE_SCHED_PROC_EXEC
214-
///////////////////////////////
215-
216-
#if defined(__aarch64__)
217-
#define CAPTURE_SCHED_PROC_EXEC
218-
#endif
219-
220173
#endif /* __KERNEL__ */
221174

222175
#endif /* FEATURE_GATES_H */

driver/main.c

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,10 @@ TRACEPOINT_PROBE(page_fault_kern_probe,
185185
TRACEPOINT_PROBE(sched_proc_fork_probe, struct task_struct *parent, struct task_struct *child);
186186
#endif
187187

188-
#ifdef CAPTURE_SCHED_PROC_EXEC
189188
TRACEPOINT_PROBE(sched_proc_exec_probe,
190189
struct task_struct *p,
191190
pid_t old_pid,
192191
struct linux_binprm *bprm);
193-
#endif
194192

195193
extern const int g_ia32_64_map[];
196194

@@ -241,9 +239,7 @@ static bool g_fault_tracepoint_disabled;
241239
static struct tracepoint *tp_sched_proc_fork;
242240
#endif
243241

244-
#ifdef CAPTURE_SCHED_PROC_EXEC
245242
static struct tracepoint *tp_sched_proc_exec;
246-
#endif
247243

248244
#ifdef _DEBUG
249245
static bool verbose = 1;
@@ -705,14 +701,12 @@ static int force_tp_set(struct ppm_consumer_t *consumer, uint32_t new_tp_set) {
705701
new_val);
706702
break;
707703
#endif
708-
#ifdef CAPTURE_SCHED_PROC_EXEC
709704
case KMOD_PROG_SCHED_PROC_EXEC:
710705
ret = compat_set_tracepoint(sched_proc_exec_probe,
711706
kmod_prog_names[idx],
712707
tp_sched_proc_exec,
713708
new_val);
714709
break;
715-
#endif
716710
default:
717711
// unmanaged idx
718712
break;
@@ -1825,11 +1819,9 @@ static int record_event_consumer(struct ppm_consumer_t *consumer,
18251819
* we need to call dedicated fillers that are not in our `g_ppm_events` table.
18261820
*/
18271821
switch(event_datap->category) {
1828-
#ifdef CAPTURE_SCHED_PROC_EXEC
18291822
case PPMC_SCHED_PROC_EXEC:
18301823
cbres = f_sched_prog_exec(&args);
18311824
break;
1832-
#endif
18331825

18341826
#ifdef CAPTURE_SCHED_PROC_FORK
18351827
case PPMC_SCHED_PROC_FORK:
@@ -1920,6 +1912,10 @@ static int record_event_consumer(struct ppm_consumer_t *consumer,
19201912
} else if(cbres == PPM_FAILURE_BUFFER_FULL) {
19211913
ring_info->n_drops_buffer++;
19221914
drops_buffer_syscall_categories_counters(event_type, ring_info);
1915+
} else if(cbres == PPM_SKIP_EVENT) {
1916+
#ifdef _DEBUG
1917+
pr_err("Skipped event %d\n", event_type);
1918+
#endif
19231919
} else {
19241920
ring_info->n_drops_buffer++;
19251921
ASSERT(false);
@@ -2087,15 +2083,12 @@ static __always_inline bool kmod_drop_syscall_exit_events(long ret, ppm_event_co
20872083
return ret == 0;
20882084
#endif
20892085

2090-
/* If `CAPTURE_SCHED_PROC_EXEC` logic is enabled we collect execve-family
2091-
* exit events through a dedicated tracepoint so we can ignore them here.
2092-
*/
2093-
#ifdef CAPTURE_SCHED_PROC_EXEC
20942086
case PPME_SYSCALL_EXECVE_19_X:
20952087
case PPME_SYSCALL_EXECVEAT_X:
2096-
/* We ignore only successful events, so ret == 0! */
2088+
/* We collect execve-family successful exit events through a dedicated `sched_process_exec`
2089+
* tracepoint, so we can ignore them here.
2090+
*/
20972091
return ret == 0;
2098-
#endif
20992092
default:
21002093
break;
21012094
}
@@ -2176,10 +2169,8 @@ TRACEPOINT_PROBE(syscall_exit_probe, struct pt_regs *regs, long ret) {
21762169

21772170
event_pair = &g_syscall_table[table_index];
21782171

2179-
#if defined(CAPTURE_SCHED_PROC_FORK) || defined(CAPTURE_SCHED_PROC_EXEC)
21802172
if(kmod_drop_syscall_exit_events(ret, event_pair->exit_event_type))
21812173
return;
2182-
#endif
21832174

21842175
if(event_pair->exit_event_type == PPME_SOCKET_SENDMMSG_X ||
21852176
event_pair->exit_event_type == PPME_SOCKET_RECVMMSG_X) {
@@ -2347,7 +2338,30 @@ TRACEPOINT_PROBE(page_fault_kern_probe,
23472338
}
23482339
#endif
23492340

2350-
#ifdef CAPTURE_SCHED_PROC_EXEC
2341+
/*
2342+
* This tracepoint generates `execve` exit events for both successful `execve` and `execveat` system
2343+
* calls. Events related to system call failures, for both system calls, are generated by
2344+
* `proc_startupdate` filler. This architectural choice is motivated by the fact that the kernel
2345+
* hasn't consistently called the correct tracepoint for `execve` and `execveat` calls on all
2346+
* architectures:
2347+
* - on `x86_64`, a successful `execveat` call is identified as `execve`, and a failing one is
2348+
* identified as `execveat`
2349+
* - on `aarch64`, till version 5.18 (actually, the fix was back-ported up to 5.15:
2350+
* https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.15.y&id=42eede3ae05bbf32cb0d87940b466ec5a76aca3f),
2351+
* neither successful `execve`s nor successful `execveat`s used to trigger the `sys_exit` (see
2352+
* https://www.spinics.net/lists/linux-trace/msg01001.html) tracepoint; only failing ones have
2353+
* always triggered the correct behaviour
2354+
* - on `s390x`, each call correctly triggers `sys_exit` tracepoint and is correctly identified as
2355+
* `execve` and `execveat`
2356+
* Indeed, the `sched/sched_process_exec` is correctly triggered on all
2357+
* architectures for successful calls. Moreover, failing calls correctly trigger the `sys_exit`
2358+
* tracepoint, and are correctly associated to the right syscall number. In the past, this design
2359+
* was applied just for `aarch64`, but since it works consistently on all architectures, its
2360+
* application was extended. The only issue seems to be that now we generate `execve` exit events
2361+
* for both `execve` and `execveat` if the call is successful: this is not a big problem, because
2362+
* even with the previous implementation we weren't able to generate any `execveat` exit event on
2363+
* either `x86_64` or `aarch64` for this scenario.
2364+
*/
23512365
TRACEPOINT_PROBE(sched_proc_exec_probe,
23522366
struct task_struct *p,
23532367
pid_t old_pid,
@@ -2367,7 +2381,6 @@ TRACEPOINT_PROBE(sched_proc_exec_probe,
23672381
&event_data,
23682382
KMOD_PROG_SCHED_PROC_EXEC);
23692383
}
2370-
#endif
23712384

23722385
#ifdef CAPTURE_SCHED_PROC_FORK
23732386
TRACEPOINT_PROBE(sched_proc_fork_probe, struct task_struct *parent, struct task_struct *child) {
@@ -2511,10 +2524,8 @@ static void visit_tracepoint(struct tracepoint *tp, void *priv) {
25112524
tp_page_fault_kernel = tp;
25122525
#endif
25132526

2514-
#ifdef CAPTURE_SCHED_PROC_EXEC
25152527
else if(!strcmp(tp->name, kmod_prog_names[KMOD_PROG_SCHED_PROC_EXEC]))
25162528
tp_sched_proc_exec = tp;
2517-
#endif
25182529

25192530
#ifdef CAPTURE_SCHED_PROC_FORK
25202531
else if(!strcmp(tp->name, kmod_prog_names[KMOD_PROG_SCHED_PROC_FORK]))
@@ -2563,12 +2574,10 @@ static int get_tracepoint_handles(void) {
25632574
}
25642575
#endif
25652576

2566-
#ifdef CAPTURE_SCHED_PROC_EXEC
25672577
if(!tp_sched_proc_exec) {
25682578
pr_err("failed to find 'sched_process_exec' tracepoint\n");
25692579
return -ENOENT;
25702580
}
2571-
#endif
25722581

25732582
#ifdef CAPTURE_SCHED_PROC_FORK
25742583
if(!tp_sched_proc_fork) {

0 commit comments

Comments
 (0)