Skip to content

Commit 29f3b51

Browse files
committed
three programs.
and tears. today was hard. Signed-off-by: vsoch <[email protected]>
1 parent 9e9aad1 commit 29f3b51

File tree

9 files changed

+1196
-26
lines changed

9 files changed

+1196
-26
lines changed

README.md

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -540,22 +540,80 @@ For this setup, you'll see `flux submit` so the jobs will run at the same time o
540540

541541
##### 3. Monitor with BCC
542542

543-
This setup will deploy a sidecar and monitor different interacts with bcc.
543+
This setup will deploy a sidecar and monitor different interacts with bcc. We have several programs that help to understand tcp, file open/closes, or futex wait times. Although you can filter to a cgroup or command, for the default we allow all containers in the pod to be seen. It generates a lot more data, but is interesting. This might need to be tweaked for actual experiments if the data is too big. The default is file open/close (you could leave out `monitor_program`) below:
544544

545545
```bash
546546
helm install \
547+
--set experiment.monitor_program=open-close \
547548
--set experiment.monitor=true \
548549
--set minicluster.save_logs=true \
549550
lammps ./lammps-reax
550551
```
551552

552-
You'll need to look at the logs to see the sidecar vs. lammps.
553+
<details>
554+
555+
```console
556+
Looking for /opt/programs/open-close/ebpf-collect.c
557+
Starting eBPF (Tracepoint for open entry).
558+
559+
Start Indicator file defined '/mnt/flux/start_ebpf_collection'. Waiting.
560+
{"event": "OPEN", "command": "python3", "retval": 12, "ts_sec": 779.540036095, "tgid": 0, "tid": 14554, "ppid": 14554, "cgroup_id": 0, "filename": "/sys/bus/event_source/devices/kprobe/type"}
561+
{"event": "OPEN", "command": "python3", "retval": 12, "ts_sec": 779.540051234, "tgid": 0, "tid": 14554, "ppid": 14554, "cgroup_id": 0, "filename": "/sys/bus/event_source/devices/kprobe/format/retprobe"}
562+
{"event": "OPEN", "command": "containerd", "retval": 193, "ts_sec": 779.629315113, "tgid": 0, "tid": 3600, "ppid": 3618, "cgroup_id": 0, "filename": "/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/204/fs"}
563+
{"event": "CLOSE", "command": "containerd", "retval": 0, "ts_sec": 779.629342785, "tgid": 1, "tid": 3600, "ppid": 3618, "cgroup_id": 6520}
564+
...
565+
{"event": "OPEN", "command": "touch", "retval": 3, "ts_sec": 803.043308743, "tgid": 0, "tid": 14883, "ppid": 14883, "cgroup_id": 3257288213055174703, "filename": "/usr/lib/locale/C.utf8/LC_NUMERIC"}
566+
{"event": "CLOSE", "command": "touch", "retval": 0, "ts_sec": 803.043310733, "tgid": 14414, "tid": 14883, "ppid": 14883, "cgroup_id": 13176}
567+
{"event": "OPEN", "command": "touch", "retval": 3, "ts_sec": 803.043316595, "tgid": 0, "tid": 14883, "ppid": 14883, "cgroup_id": 3257288213055174703, "filename": "/usr/lib/locale/C.utf8/LC_CTYPE"}
568+
{"event": "CLOSE", "command": "touch", "retval": 0, "ts_sec": 803.043318514, "tgid": 14414, "tid": 14883, "ppid": 14883, "cgroup_id": 13176}
569+
{"event": "OPEN", "command": "touch", "retval": 3, "ts_sec": 803.043359627, "tgid": 0, "tid": 14883, "ppid": 14883, "cgroup_id": 3257288213055174703, "filename": "/mnt/flux/stop_ebpf_collection"}
570+
{"event": "CLOSE", "command": "touch", "retval": 0, "ts_sec": 803.043360931, "tgid": 14414, "tid": 14883, "ppid": 14883, "cgroup_id": 13176}
571+
572+
Indicator file '/mnt/flux/stop_ebpf_collection' found. Stopping.
573+
Cleaning up BPF resources...
574+
```
575+
576+
</details>
577+
578+
Here is how to see futex wait times.
579+
580+
```bash
581+
helm install \
582+
--set experiment.monitor_program=futex \
583+
--set experiment.monitor=true \
584+
--set minicluster.save_logs=true \
585+
lammps ./lammps-reax
586+
```
587+
588+
This likely needs to be consolidated (it's a lot of data). Here is an example.
589+
590+
```bash
591+
{"event_type": "FUTEX_WAIT_END", "timestamp_sec": 3891.635076022, "tgid": 34931, "tid": 35146, "comm": "containerd-shim", "cgroup_id": 6520, "futex_op_full": 128, "futex_op_str": "FUTEX_WAIT_PRIVATE", "wait_duration_ns": 21462, "wait_duration_human": "21.46us"}
592+
{"event_type": "FUTEX_WAIT_END", "timestamp_sec": 3891.635086712, "tgid": 34931, "tid": 35388, "comm": "containerd-shim", "cgroup_id": 6520, "futex_op_full": 128, "futex_op_str": "FUTEX_WAIT_PRIVATE", "wait_duration_ns": 42892, "wait_duration_human": "42.89us"}
593+
{"event_type": "FUTEX_WAIT_END", "timestamp_sec": 3891.635102633, "tgid": 3600, "tid": 3602, "comm": "containerd", "cgroup_id": 6520, "futex_op_full": 128, "futex_op_str": "FUTEX_WAIT_PRIVATE", "wait_duration_ns": 16693, "wait_duration_human": "16.69us"}
594+
{"event_type": "FUTEX_WAIT_END", "timestamp_sec": 3891.635107428, "tgid": 3600, "tid": 6823, "comm": "containerd", "cgroup_id": 6520, "futex_op_full": 128, "futex_op_str": "FUTEX_WAIT_PRIVATE", "wait_duration_ns": 34781, "wait_duration_human": "34.78us"}
595+
```
596+
597+
Finally, here is tcp
598+
599+
```bash
600+
helm install \
601+
--set experiment.monitor_program=tcp \
602+
--set experiment.monitor=true \
603+
--set minicluster.save_logs=true \
604+
lammps ./lammps-reax
605+
```
606+
607+
Here is example data:
553608

554609
```bash
555-
kubeclt lo
610+
{"event_type": "RECV", "timestamp_sec": 3978.855691216, "tgid": 36693, "tid": 36693, "comm": "lmp", "cgroup_id": 19947, "fd": 11, "bytes": 20, "bytes_human": "20", "duration_ns": 13430, "duration_human": "13.43us"}
611+
{"event_type": "RECV", "timestamp_sec": 3978.855698151, "tgid": 36693, "tid": 36693, "comm": "lmp", "cgroup_id": 19947, "fd": 11, "bytes": 20, "bytes_human": "20", "duration_ns": 1920, "duration_human": "1.92us"}
612+
{"event_type": "RECV", "timestamp_sec": 3978.855731232, "tgid": 36693, "tid": 36693, "comm": "lmp", "cgroup_id": 19947, "fd": 11, "bytes": 20, "bytes_human": "20", "duration_ns": 12363, "duration_human": "12.36us"}
613+
{"event_type": "RECV", "timestamp_sec": 3978.855737251, "tgid": 36693, "tid": 36693, "comm": "lmp", "cgroup_id":
556614
```
557615

558-
Try changing the command:
616+
Also try changing the command entirely.
559617

560618
```bash
561619
helm install \
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
#include <uapi/linux/ptrace.h>
2+
#include <linux/sched.h>
3+
// Do NOT include <linux/futex.h> here as it can cause compilation issues with BCC
4+
5+
// Define FUTEX constants directly in BPF C code
6+
// These values are from standard Linux UAPI headers (e.g., <uapi/linux/futex.h>)
7+
#ifndef FUTEX_WAIT
8+
#define FUTEX_WAIT 0
9+
#endif
10+
#ifndef FUTEX_WAKE
11+
#define FUTEX_WAKE 1
12+
#endif
13+
// Add other FUTEX_ op codes if we want to trace them, e.g.
14+
// #ifndef FUTEX_LOCK_PI
15+
// #define FUTEX_LOCK_PI 6
16+
// #endif
17+
18+
// Flags for futex op
19+
#ifndef FUTEX_PRIVATE_FLAG
20+
#define FUTEX_PRIVATE_FLAG 128
21+
#endif
22+
#ifndef FUTEX_CLOCK_REALTIME
23+
#define FUTEX_CLOCK_REALTIME 256 // Note: If this flag is set, the 'op' argument structure changes for timeout
24+
#endif
25+
26+
// Mask to get the command part of the futex op
27+
// FUTEX_CMD_MASK should correctly isolate the command part from flags.
28+
// A common way to get the command is (op & 0x7F) or (op & ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME))
29+
// The definition of FUTEX_CMD_MASK here assumes we only care about private/realtime flags for masking.
30+
// If other flags are present in the lower bits that are not part of the command, this might need adjustment.
31+
// For simplicity, we often just check `(op & 0x7F) == FUTEX_WAIT` or `op == FUTEX_WAIT` or `op == FUTEX_WAIT_PRIVATE`.
32+
// Let's use a simpler direct check for FUTEX_WAIT and FUTEX_WAIT_PRIVATE.
33+
#ifndef FUTEX_WAIT_PRIVATE
34+
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
35+
#endif
36+
37+
38+
#define TASK_COMM_LEN_EBPF 16
39+
40+
enum event_type {
41+
EVENT_FUTEX_WAIT_COMPLETED = 0,
42+
};
43+
44+
struct data_t {
45+
u64 timestamp_ns;
46+
u32 tgid;
47+
u32 tid;
48+
// u32 ppid; // Add back if needed and fetched in BPF
49+
u64 cgroup_id;
50+
char comm[TASK_COMM_LEN_EBPF];
51+
enum event_type type;
52+
int futex_op_full; // The original futex op argument (including flags)
53+
u64 wait_duration_ns;
54+
};
55+
BPF_RINGBUF_OUTPUT(events, 8);
56+
57+
BPF_HASH(futex_start_times, u32, u64); // Key: TID, Value: Start timestamp
58+
BPF_HASH(futex_ops_temp, u32, int); // Key: TID, Value: Original futex_op_full from enter
59+
60+
// --- Debugging (Optional) ---
61+
enum debug_stage {
62+
DBG_FUTEX_ENTER_TRACKING = 400,
63+
DBG_FUTEX_ENTER_NOT_TRACKING = 401,
64+
DBG_FUTEX_EXIT_FOUND_START = 402,
65+
DBG_FUTEX_EXIT_NO_START = 403,
66+
DBG_FUTEX_SUBMITTED = 404,
67+
DBG_FUTEX_RESERVE_FAIL = 405
68+
};
69+
struct debug_event_t {
70+
u32 id_tid; // Using u32 for TID
71+
enum debug_stage stage;
72+
long val1_op;
73+
long val2_duration_or_misc;
74+
};
75+
BPF_RINGBUF_OUTPUT(debug_events_rb, 4);
76+
77+
static __always_inline void send_futex_debug_event(u32 current_tid, enum debug_stage stage, long v1, long v2) {
78+
struct debug_event_t *dbg_evt = debug_events_rb.ringbuf_reserve(sizeof(struct debug_event_t));
79+
if (dbg_evt) {
80+
dbg_evt->id_tid = current_tid;
81+
dbg_evt->stage = stage;
82+
dbg_evt->val1_op = v1;
83+
dbg_evt->val2_duration_or_misc = v2;
84+
debug_events_rb.ringbuf_submit(dbg_evt, 0);
85+
}
86+
}
87+
// --- End Debugging ---
88+
89+
TRACEPOINT_PROBE(syscalls, sys_enter_futex) {
90+
u64 pid_tgid = bpf_get_current_pid_tgid();
91+
u32 tid = (u32)pid_tgid;
92+
93+
// 'args->op' is how BCC TRACEPOINT_PROBE typically exposes tracepoint arguments
94+
// that match the 'op' field in /sys/kernel/debug/tracing/events/syscalls/sys_enter_futex/format
95+
int futex_op_full = args->op;
96+
int futex_op_cmd = futex_op_full & ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME); // Get command part
97+
98+
// We are interested in FUTEX_WAIT or FUTEX_WAIT_PRIVATE
99+
if (futex_op_cmd == FUTEX_WAIT) {
100+
u64 start_time_ns = bpf_ktime_get_ns();
101+
futex_start_times.update(&tid, &start_time_ns);
102+
futex_ops_temp.update(&tid, &futex_op_full); // Store the original op
103+
// send_futex_debug_event(tid, DBG_FUTEX_ENTER_TRACKING, futex_op_full, 0);
104+
} else {
105+
// send_futex_debug_event(tid, DBG_FUTEX_ENTER_NOT_TRACKING, futex_op_full, 0);
106+
}
107+
return 0;
108+
}
109+
110+
TRACEPOINT_PROBE(syscalls, sys_exit_futex) {
111+
u64 pid_tgid = bpf_get_current_pid_tgid();
112+
u32 tgid = pid_tgid >> 32;
113+
u32 tid = (u32)pid_tgid;
114+
115+
u64 *start_time_ns_ptr;
116+
start_time_ns_ptr = futex_start_times.lookup(&tid);
117+
118+
if (start_time_ns_ptr) {
119+
u64 end_time_ns = bpf_ktime_get_ns();
120+
u64 duration_ns = end_time_ns - *start_time_ns_ptr;
121+
122+
int *original_op_ptr = futex_ops_temp.lookup(&tid);
123+
int original_op = -1; // Default if not found (shouldn't happen if start_time was found)
124+
if (original_op_ptr) {
125+
original_op = *original_op_ptr;
126+
}
127+
128+
futex_start_times.delete(&tid);
129+
futex_ops_temp.delete(&tid);
130+
131+
// send_futex_debug_event(tid, DBG_FUTEX_EXIT_FOUND_START, original_op, duration_ns);
132+
133+
struct data_t *event_data_ptr = events.ringbuf_reserve(sizeof(*event_data_ptr));
134+
if (!event_data_ptr) {
135+
// send_futex_debug_event(tid, DBG_FUTEX_RESERVE_FAIL, original_op, 0);
136+
return 0;
137+
}
138+
139+
event_data_ptr->timestamp_ns = end_time_ns;
140+
event_data_ptr->tgid = tgid;
141+
event_data_ptr->tid = tid;
142+
event_data_ptr->cgroup_id = bpf_get_current_cgroup_id();
143+
bpf_get_current_comm(&event_data_ptr->comm, sizeof(event_data_ptr->comm));
144+
event_data_ptr->comm[TASK_COMM_LEN_EBPF - 1] = '\0';
145+
event_data_ptr->type = EVENT_FUTEX_WAIT_COMPLETED;
146+
event_data_ptr->futex_op_full = original_op;
147+
event_data_ptr->wait_duration_ns = duration_ns;
148+
149+
events.ringbuf_submit(event_data_ptr, 0);
150+
// send_futex_debug_event(tid, DBG_FUTEX_SUBMITTED, original_op, duration_ns);
151+
152+
} else {
153+
// send_futex_debug_event(tid, DBG_FUTEX_EXIT_NO_START, args->ret, 0); // args->ret is syscall return
154+
}
155+
return 0;
156+
}
157+

0 commit comments

Comments
 (0)