diff --git a/examples/c/.gitignore b/examples/c/.gitignore index 9edf6d75..4490adc1 100644 --- a/examples/c/.gitignore +++ b/examples/c/.gitignore @@ -16,3 +16,7 @@ /lsm /cmake-build-debug/ /cmake-build-release/ +compile_commands.json +/app +/libapp.so +/snooper diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index a923e749..78eb7f09 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -77,6 +77,7 @@ find_package(BpfObject REQUIRED) file(GLOB apps *.bpf.c) if(NOT CARGO_EXISTS) list(REMOVE_ITEM apps ${CMAKE_CURRENT_SOURCE_DIR}/profile.bpf.c) + list(REMOVE_ITEM apps ${CMAKE_CURRENT_SOURCE_DIR}/snooper.bpf.c) endif() foreach(app ${apps}) get_filename_component(app_stem ${app} NAME_WE) @@ -93,4 +94,10 @@ foreach(app ${apps}) target_link_libraries(${app_stem} ${CMAKE_CURRENT_SOURCE_DIR}/../../blazesym/target/release/libblazesym_c.a -lpthread -lrt -ldl) endif() + if(${app_stem} STREQUAL snooper) + target_include_directories(${app_stem} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../blazesym/capi/include) + target_link_libraries(${app_stem} + ${CMAKE_CURRENT_SOURCE_DIR}/../../blazesym/target/release/libblazesym_c.a -lpthread -lrt -ldl) + endif() endforeach() diff --git a/examples/c/Makefile b/examples/c/Makefile index 912b4e5e..13584c34 100644 --- a/examples/c/Makefile +++ b/examples/c/Makefile @@ -31,7 +31,7 @@ CARGO ?= $(shell which cargo) ifeq ($(strip $(CARGO)),) BZS_APPS := else -BZS_APPS := profile +BZS_APPS := profile snooper APPS += $(BZS_APPS) # Required by libblazesym ALL_LDFLAGS += -lrt -ldl -lpthread -lm @@ -70,12 +70,12 @@ $(call allow-override,CC,$(CROSS_COMPILE)cc) $(call allow-override,LD,$(CROSS_COMPILE)ld) .PHONY: all -all: $(APPS) +all: $(APPS) app .PHONY: clean clean: $(call msg,CLEAN) - $(Q)rm -rf $(OUTPUT) $(APPS) + $(Q)rm -rf $(OUTPUT) $(APPS) app libapp.so $(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): $(call msg,MKDIR,$@) @@ -136,3 +136,12 @@ $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) # keep 
intermediate (.skel.h, .bpf.o, etc) targets .SECONDARY: + +# Build target app and its shared library +libapp.so: app_lib.c app_lib.h + $(call msg,SHLIB,$@) + $(Q)$(CC) $(CFLAGS) -shared -fPIC -o $@ app_lib.c + +app: app.c app_lib.h libapp.so + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) -o $@ app.c -L. -lapp -Wl,-rpath,'$$ORIGIN' -lpthread diff --git a/examples/c/app.c b/examples/c/app.c new file mode 100644 index 00000000..9038cf8b --- /dev/null +++ b/examples/c/app.c @@ -0,0 +1,140 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "app_lib.h" + +static __thread int tls_dont_care; /* just to avoid zero offsets everywhere else */ + +__thread int tls_exec; +extern __thread int tls_shared; + +static __thread int tls_local_exec; + +int __attribute__((weak)) get_tls_exec(void) +{ + return tls_exec; +} + +int __attribute__((weak)) get_tls_shared(void) +{ + return tls_shared; +} + +int __attribute__((weak)) get_tls_local_exec(void) +{ + return tls_local_exec; +} + +/* Forward declarations for recursive functions */ +void func_a(int depth); +void func_b(int depth); +void func_c(int depth); + +static __always_inline void func_mux(int depth) +{ + if (depth <= 0) + return; + + switch (rand() % 3) { + case 0: func_a(depth - 1); break; + case 1: func_b(depth - 1); break; + case 2: func_c(depth - 1); break; + } +} + +void func_a(int depth) +{ + volatile char stack_space[120]; + stack_space[119] = 'a'; + stack_space[0] += 1; + + if (depth <= 0) + return; + + func_mux(depth - 1); +} + +void func_b(int depth) +{ + volatile char stack_space[350]; + stack_space[349] = 'b'; + stack_space[0] += 1; + + if (depth <= 0) + return; + + func_mux(depth - 1); +} + +void func_c(int depth) +{ + volatile char stack_space[800]; + stack_space[799] = 'c'; + stack_space[0] += 1; + + if (depth <= 0) + return; + + func_mux(depth - 1); +} + +static void *thread_func(void *arg) +{ + time_t last_print 
= 0; + (void)arg; + + pthread_setname_np(pthread_self(), "app_thread"); + + while (1) { + time_t now; + + errno = 123456789; + func_mux(10); + errno = 987654321; + + now = time(NULL); + if (now > last_print) { + tls_exec += 4; + tls_shared += 8; + tls_local_exec += 16; + bump_tls_local_shared(); + bump_tls_local_shared(); + + printf("Hello from thread (exec=%d, shared=%d, local_exec=%d, local_shared=%d)!\n", + get_tls_exec(), get_tls_shared(), get_tls_local_exec(), get_tls_local_shared()); + last_print = now; + } + } + + return NULL; +} + +int main() { + pthread_t thread; + + pthread_create(&thread, NULL, thread_func, NULL); + + while (1) { + tls_dont_care += 1; + tls_exec += 2; + tls_shared += 4; + tls_local_exec += 8; + bump_tls_local_shared(); + + printf("Hello from app (exec=%d, shared=%d, local_exec=%d, local_shared=%d)!\n", + get_tls_exec(), get_tls_shared(), get_tls_local_exec(), get_tls_local_shared()); + sleep(1); + } + + return 0; +} diff --git a/examples/c/app_lib.c b/examples/c/app_lib.c new file mode 100644 index 00000000..b7c8aaff --- /dev/null +++ b/examples/c/app_lib.c @@ -0,0 +1,8 @@ +__thread int tls_shared; +__thread int tls_shared2; +static __thread int tls_local_shared; + +int get_tls_local_shared(void) { return tls_local_shared; } +int get_tls_shared(void) { return tls_shared; } +int get_tls_shared2(void) { return tls_shared2; } +void bump_tls_local_shared(void) { tls_local_shared += 16; } diff --git a/examples/c/app_lib.h b/examples/c/app_lib.h new file mode 100644 index 00000000..989764b7 --- /dev/null +++ b/examples/c/app_lib.h @@ -0,0 +1,7 @@ +#ifndef APP_LIB_H +#define APP_LIB_H + +int get_tls_local_shared(void); +void bump_tls_local_shared(void); + +#endif /* APP_LIB_H */ diff --git a/examples/c/snooper.bpf.c b/examples/c/snooper.bpf.c new file mode 100644 index 00000000..f7f8b6f6 --- /dev/null +++ b/examples/c/snooper.bpf.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2025 Meta Platforms, Inc. 
*/ +#include "vmlinux.h" +#include +#include +#include + +#include "snooper.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +/* Error codes - can't include errno.h in BPF */ +#define ENOENT 2 +#define EOPNOTSUPP 95 +#define EPROTO 71 + +extern int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) __ksym __weak; +extern int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) __ksym __weak; + +/* ========== ELF constants ========== */ + +#define ELFMAG0 0x7f +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' + +#define ELFCLASS64 2 +#define EI_CLASS 4 + +/* ELF types (e_type) */ +#define ET_EXEC 2 /* Executable file */ +#define ET_DYN 3 /* Shared object file */ + +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_RELA 4 +#define SHT_DYNSYM 11 + +#define STT_NOTYPE 0 +#define STT_OBJECT 1 +#define STT_FUNC 2 +#define STT_SECTION 3 +#define STT_FILE 4 +#define STT_COMMON 5 +#define STT_TLS 6 + +#define ELF64_ST_TYPE(info) ((info) & 0xf) +#define ELF64_R_SYM(info) ((info) >> 32) +#define ELF64_R_TYPE(info) ((info) & 0xffffffff) + +#define R_X86_64_DTPMOD64 16 +#define R_X86_64_DTPOFF64 17 + +#define VM_EXEC 0x00000004 + +#define SHN_XINDEX 0xffff +#define MAX_SYM_NAME 64 + +struct task_state { + struct task_event event; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 4096); + __type(key, u32); + __type(value, struct task_state); +} task_states SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1024 * 1024); +} rb SEC(".maps"); + +struct elf_symtab { + u32 shndx; + u32 symtab_cnt; + u64 symtab_off; + u64 strtab_off; +}; + +struct elf_relasec { + u32 shndx; + u32 rela_cnt; + u64 rela_off; +}; + +struct elf { + u16 type; /* ET_EXEC or ET_DYN */ + u64 shoff; /* section headers list offset */ + u32 shnum; /* number of sections */ + + struct elf_symtab symtab; + + struct elf_symtab dynsym; + struct elf_relasec rela_dyn; /* .rela.dyn 
section info */ +}; + +struct scratch { + struct elf elf; + + struct elf64_hdr ehdr; + struct elf64_shdr shdr; + struct elf64_shdr strtab_shdr; + + struct elf64_sym sym; + struct elf64_rela rela; + char sym_name[MAX_SYM_NAME]; +}; + +static int zero = 0; + +char tls_var_name[64]; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct scratch); +} scratch_map SEC(".maps"); + +/* + * Frame pointer-based user stack unwinding. + * + * On x86_64 with frame pointers enabled (-fno-omit-frame-pointer): + * [rbp + 0] = saved rbp (previous frame pointer) + * [rbp + 8] = return address + * + * We walk the chain of frame pointers to collect return addresses. + */ +static int unwind_user_stack(struct task_struct *task, u64 *stack, int max_depth) +{ + struct pt_regs *regs; + struct frame { + u64 next_fp; /* saved frame pointer (rbp) */ + u64 ret_addr; /* return address */ + } frame; + u64 fp; + unsigned i = 0; + + regs = bpf_core_cast((void *)bpf_task_pt_regs(task), struct pt_regs); + if (!(regs->cs & 3)) + return 0; /* not in user space mode */ + + stack[0] = regs->ip; + + fp = regs->bp; + bpf_for(i, 1, MAX_STACK_DEPTH) { + /* read the frame, [fp] = next_fp, [fp+8] = ret_addr */ + if (bpf_copy_from_user_task(&frame, sizeof(frame), (void *)fp, task, 0)) + break; + + barrier_var(i); + if (i < MAX_STACK_DEPTH) + stack[i] = frame.ret_addr; + + fp = frame.next_fp; + } + + return i * sizeof(u64); +} + +static int parse_elf(struct bpf_dynptr *fdptr, struct elf *elf, struct scratch *s) +{ + int err, i; + + /* ELF header */ + err = bpf_dynptr_read(&s->ehdr, sizeof(s->ehdr), fdptr, 0, 0); + if (err) { + bpf_printk(" [ELF] Failed to read ELF header: %d", err); + return err; + } + + /* Verify ELF magic */ + if (s->ehdr.e_ident[0] != ELFMAG0 || s->ehdr.e_ident[1] != ELFMAG1 || + s->ehdr.e_ident[2] != ELFMAG2 || s->ehdr.e_ident[3] != ELFMAG3) { + bpf_printk(" [ELF] Not an ELF file"); + return -EPROTO; + } + + /* Only 
support 64-bit ELF for now */ + if (s->ehdr.e_ident[EI_CLASS] != ELFCLASS64) { + bpf_printk(" [ELF] Not 64-bit ELF"); + return -EOPNOTSUPP; + } + + elf->type = s->ehdr.e_type; + elf->shoff = s->ehdr.e_shoff; + elf->shnum = s->ehdr.e_shnum; + + //bpf_printk(" [ELF] Section headers: off=%llu, num=%u", elf->shoff, elf->shnum); + if (elf->shnum == 0 || elf->shnum >= SHN_XINDEX) + return -EOPNOTSUPP; + + elf->symtab.shndx = 0; + elf->dynsym.shndx = 0; + elf->rela_dyn.shndx = 0; + + bpf_for(i, 1, elf->shnum) { + u64 symtab_off, symtab_size, strtab_shdr_off; + u32 symtab_entsize, strtab_idx; + u64 shdr_off = elf->shoff + i * sizeof(struct elf64_shdr); + + err = bpf_dynptr_read(&s->shdr, sizeof(s->shdr), fdptr, shdr_off, 0); + if (err) { + bpf_printk(" [ELF] Failed to read shdr[%d]: %d", i, err); + break; + } + + if (s->shdr.sh_type == SHT_RELA) { + /* Handle .rela.dyn section (SHT_RELA linked to .dynsym) */ + u32 rela_entsize = s->shdr.sh_entsize ?: sizeof(struct elf64_rela); + + if (elf->rela_dyn.shndx == 0) { + /* TODO: validate that shdr.sh_link points to SHT_DYNSYM section */ + elf->rela_dyn.shndx = i; + elf->rela_dyn.rela_off = s->shdr.sh_offset; + elf->rela_dyn.rela_cnt = s->shdr.sh_size / rela_entsize; + } + } else if (s->shdr.sh_type == SHT_SYMTAB || s->shdr.sh_type == SHT_DYNSYM) { + symtab_off = s->shdr.sh_offset; + symtab_size = s->shdr.sh_size; + symtab_entsize = s->shdr.sh_entsize ?: sizeof(struct elf64_sym); + + /* sh_link points to the associated string table */ + strtab_idx = s->shdr.sh_link; + strtab_shdr_off = elf->shoff + strtab_idx * sizeof(struct elf64_shdr); + err = bpf_dynptr_read(&s->strtab_shdr, sizeof(s->strtab_shdr), fdptr, strtab_shdr_off, 0); + if (err) { + bpf_printk(" [ELF] Failed to read strtab shdr[%d]: %d", strtab_idx, err); + return err; + } + + //bpf_printk(" [ELF] Found %s: off=%llu, cnt=%llu", + // s->shdr.sh_type == SHT_SYMTAB ? 
".symtab" : ".dynsym", + // symtab_off, symtab_size / symtab_entsize); + + if (s->shdr.sh_type == SHT_SYMTAB) { + elf->symtab.shndx = i; + elf->symtab.symtab_off = symtab_off; + elf->symtab.symtab_cnt = symtab_size / symtab_entsize; + elf->symtab.strtab_off = s->strtab_shdr.sh_offset; + } else { + elf->dynsym.shndx = i; + elf->dynsym.symtab_off = symtab_off; + elf->dynsym.symtab_cnt = symtab_size / symtab_entsize; + elf->dynsym.strtab_off = s->strtab_shdr.sh_offset; + } + } + + if (elf->dynsym.shndx && elf->symtab.shndx && elf->rela_dyn.shndx) + break; + } + + return 0; +} + +static const char *sym_type_str(u8 type) +{ + switch (type) { + case STT_NOTYPE: return "NOTYPE"; + case STT_OBJECT: return "OBJECT"; + case STT_FUNC: return "FUNC"; + case STT_SECTION: return "SECTION"; + case STT_FILE: return "FILE"; + case STT_COMMON: return "COMMON"; + case STT_TLS: return "TLS"; + default: return "UNKNOWN"; + } +} + +static int find_symtab_sym(struct bpf_dynptr *fdptr, struct elf_symtab *symtab, + const char *sym_name, int sym_type, + struct scratch *s) +{ + int err, i, j; + + if (!symtab->shndx) + return -ENOENT; + + bpf_for(i, 1, symtab->symtab_cnt) { + u64 sym_off = symtab->symtab_off + i * sizeof(struct elf64_sym); + u8 type; + bool match; + + err = bpf_dynptr_read(&s->sym, sizeof(s->sym), fdptr, sym_off, 0); + if (err) + return err; + + /* skip anonymous or external symbols */ + if (s->sym.st_name == 0 || s->sym.st_shndx == 0) + continue; + + type = ELF64_ST_TYPE(s->sym.st_info); + if (sym_type && type != sym_type) + continue; + + err = bpf_dynptr_read(s->sym_name, sizeof(s->sym_name), fdptr, + symtab->strtab_off + s->sym.st_name, 0); + if (err) + return err; + s->sym_name[sizeof(s->sym_name) - 1] = '\0'; + + if (bpf_strcmp(s->sym_name, sym_name) != 0) + continue; + + return i; + } + + return -ENOENT; +} + +static int find_sym(struct bpf_dynptr *fdptr, struct elf *elf, const char *sym_name, int sym_type, struct scratch *s) +{ + int idx; + + idx = 
find_symtab_sym(fdptr, &elf->dynsym, sym_name, sym_type, s); + if (idx > 0) + return idx; + + return find_symtab_sym(fdptr, &elf->symtab, sym_name, sym_type, s); +} + +/* Iterate symbols from a symbol table and print all symbols. */ +static void print_symtab(struct bpf_dynptr *fdptr, struct elf_symtab *symtab, + const char *name, struct scratch *s) +{ + int err, i; + + if (!symtab->shndx) + return; + + bpf_printk(" [ELF] Parsing %s (%u symbols):", name, symtab->symtab_cnt); + bpf_for(i, 1, symtab->symtab_cnt) { + u64 sym_off = symtab->symtab_off + i * sizeof(struct elf64_sym); + u8 sym_type; + + err = bpf_dynptr_read(&s->sym, sizeof(s->sym), fdptr, sym_off, 0); + if (err) + break; + + if (s->sym.st_name == 0) + continue; + + /* Skip undefined symbols (external references) */ + if (s->sym.st_shndx == 0) + continue; + + err = bpf_dynptr_read(s->sym_name, sizeof(s->sym_name), fdptr, + symtab->strtab_off + s->sym.st_name, 0); + if (err) { + bpf_printk(" [SYM] Failed to read symbol #%d: %d\n", i, err); + break; + } + s->sym_name[sizeof(s->sym_name) - 1] = '\0'; + + sym_type = ELF64_ST_TYPE(s->sym.st_info); + + bpf_printk(" [SYM] 0x%llx %s %s", s->sym.st_value, sym_type_str(sym_type), s->sym_name); + } +} + +static void print_symbols(struct bpf_dynptr *fdptr, struct elf *elf, struct scratch *s) +{ + print_symtab(fdptr, &elf->symtab, ".symtab", s); + print_symtab(fdptr, &elf->dynsym, ".dynsym", s); +} + +int MINUS_ONE = -1; + +/* + * On x86_64, TLS is accessed via the FS segment register. + * The FS base points to the Thread Control Block (TCB). + * + * TCB layout (glibc): + * offset 0: void *tcb - self pointer + * offset 8: dtv_t *dtv - Dynamic Thread Vector + * offset 16: void *self - thread descriptor + * ... + * + * DTV layout: + * dtv[0].counter = generation/size + * dtv[1].pointer.val = TLS block for module 1 (main executable) + * dtv[2].pointer.val = TLS block for module 2 (first shared lib) + * ... 
+ * + * For Initial Exec (IE) model (main executable TLS): + * TLS vars are accessed as negative offsets from TP (thread pointer) + * TP = fsbase (on x86_64 with glibc) + * + * For General Dynamic (GD) model (shared library TLS): + * __tls_get_addr() is called with {module_id, offset} + * Returns: dtv[module_id].pointer.val + offset + */ + +/* https://github.com/bminor/glibc/blob/master/sysdeps/generic/dl-dtv.h#L29 */ +typedef union dtv { + size_t counter; + struct dtv_pointer { + void *val; + void *to_free; + } pointer; +} dtv_t; + +/* Partial definition for tcbhead_t + * https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 + */ +typedef struct { + void *tcb; + dtv_t *dtv; +} tcbhead_t; + +struct tls_index { + long mod_id; + long offset; +}; + +/* + * Find the GOT entry offset for a TLS symbol by scanning .rela.dyn for + * R_X86_64_DTPMOD64 relocations matching the symbol index. + * + * At runtime, this GOT entry contains {module_id, tls_offset} which can be + * read from the loaded library's memory. + * + * Returns: GOT virt offset on success, negative error on failure + */ +static long find_tls_got_entry(struct bpf_dynptr *fdptr, struct elf *elf, + u32 sym_idx, struct scratch *s) +{ + int err, i; + + if (!elf->rela_dyn.shndx) + return -ENOENT; + + bpf_for(i, 0, elf->rela_dyn.rela_cnt) { + u64 rela_off = elf->rela_dyn.rela_off + i * sizeof(struct elf64_rela); + u32 rela_sym, rela_type; + + err = bpf_dynptr_read(&s->rela, sizeof(s->rela), fdptr, rela_off, 0); + if (err) + return err; + + rela_type = ELF64_R_TYPE(s->rela.r_info); + if (rela_type != R_X86_64_DTPMOD64) + continue; + + rela_sym = ELF64_R_SYM(s->rela.r_info); + if (sym_idx && rela_sym != sym_idx) + continue; + + /* r_offset is the GOT entry offset */ + return s->rela.r_offset; + } + + return -ENOENT; +} + +/* Read tls_index {module_id, offset} from loaded library memory. 
*/ +static inline int read_got_entry(struct task_struct *task, struct vm_area_struct *vma, const char *vma_name, + struct bpf_dynptr *fdptr, struct scratch *s, long got_off, struct tls_index *tls_index) +{ + /* TODO: this should translate file offset to virtoffset by looking at section header */ + long got_addr = vma->vm_start - vma->vm_pgoff * __PAGE_SIZE + got_off; + + int err = bpf_copy_from_user_task(tls_index, sizeof(*tls_index), (void *)got_addr, task, 0); + if (err) { + bpf_printk("[TLS] Failed to read GOT entry for '%s' at %px: %d", vma_name, got_addr, err); + return -EPROTO; + } + + bpf_printk("[TLS] GOT TLS index for '%s' at %px: module_id=%ld, offset=%ld", + vma_name, got_addr, tls_index->mod_id, tls_index->offset); + + return 0; +} + +/* Figure out absolute address of a TLS variable identified by module ID + offset */ +static long find_tls_addr(struct task_struct *task, long module_id, long offset) +{ + long dtv_ptr, tls_block; + int err; + + long fsbase = task->thread.fsbase; + + /* Read DTV pointer from TCB (offset 8) */ + err = bpf_copy_from_user_task(&dtv_ptr, sizeof(dtv_ptr), + (void *)(fsbase + offsetof(tcbhead_t, dtv)), task, 0); + if (err) { + bpf_printk("[TLS] Failed to read DTV pointer: %d", err); + return err; + } + + //bpf_printk("[TLS] fsbase=%px, dtv=%px", fsbase, dtv_ptr); + + /* + * Read TLS block pointer from DTV[module_id]. + * Each DTV entry is 16 bytes (see dtv_t above). 
+ */ + err = bpf_copy_from_user_task(&tls_block, sizeof(tls_block), + (void *)(dtv_ptr + module_id * sizeof(dtv_t)), task, 0); + if (err) { + bpf_printk("[TLS] Failed to read DTV[%ld]: %d", module_id, err); + return err; + } + + //bpf_printk("[TLS] dtv[%ld].val = %px", module_id, tls_block); + + /* Special value -1 means TLS block not yet allocated */ + if (tls_block == (u64)-1) { + bpf_printk("[TLS] TLS block not allocated for module %ld", module_id); + return -ENOENT; + } + + return tls_block + offset; +} + +static long find_tls_var(struct task_struct *task, struct vm_area_struct *vma, const char *vma_name, + struct bpf_dynptr *fdptr, const char *tls_var_name, struct scratch *s) +{ + struct tls_index tls_index; + int err; + + int sym_idx = find_symtab_sym(fdptr, &s->elf.dynsym, tls_var_name, STT_TLS, s); + if (sym_idx > 0) { + bpf_printk("[TLS] Found TLS symbol '%s' in .dynsym for '%s': st_value=%llx sz=%llu shndx=%u", + s->sym_name, vma_name, + s->sym.st_value, s->sym.st_size, s->sym.st_shndx); + + long got_off = find_tls_got_entry(fdptr, &s->elf, sym_idx, s); + //bpf_printk("[TLS] GOT entry at virt offset 0x%llx", got_off); + if (got_off < 0) { + bpf_printk("[TLS] No GOT entry found for symbol #%d: %ld", sym_idx, got_off); + return -EPROTO; + } + + /* Read tls_index {module_id, offset} from loaded library memory. 
*/ + err = read_got_entry(task, vma, vma_name, fdptr, s, got_off, &tls_index); + if (err) { + bpf_printk("[TLS] Failed reading GOT entry symbol #%d at %px: %ld", + sym_idx, got_off); + return err; + } + + return find_tls_addr(task, tls_index.mod_id, tls_index.offset); + } + + /* local TLS variable not in .dynsym */ + sym_idx = find_symtab_sym(fdptr, &s->elf.symtab, tls_var_name, STT_TLS, s); + if (sym_idx > 0) { + bpf_printk("[TLS] Found TLS symbol '%s' in .symtab for '%s': st_value=%llx sz=%llu shndx=%u", + s->sym_name, vma_name, + s->sym.st_value, s->sym.st_size, s->sym.st_shndx); + + if (s->elf.type == ET_EXEC) { + /* for local exec TLS model, module ID is 1 */ + return find_tls_addr(task, 1, s->sym.st_value); + } else { + /* + * For local symbol in shared lib, try to find module ID using *ANY* + * DTPMOD64 relo, and then assume that st_value gives us valid offset + * within module's block. + */ + long got_off = find_tls_got_entry(fdptr, &s->elf, 0, s); + if (got_off < 0) { + bpf_printk("[TLS] No GOT entry (any at all) found in '%s': %ld", vma_name, got_off); + return -EOPNOTSUPP; + } + + /* Read tls_index {module_id, offset} from loaded library memory. */ + err = read_got_entry(task, vma, vma_name, fdptr, s, got_off, &tls_index); + if (err) + return err; + + return find_tls_addr(task, tls_index.mod_id, s->sym.st_value); + } + + return -EOPNOTSUPP; + } + + return -ENOENT; +} + +/* + * Iterate VMAs of the current task, find executable file-backed VMAs, + * and parse their ELF symbols. 
+ */ +static int enumerate_vmas(struct task_struct *task, struct task_event *event) +{ + struct vm_area_struct *vma; + struct scratch *s; + u64 last_ino = MINUS_ONE; + int err; + + s = bpf_map_lookup_elem(&scratch_map, &zero); + if (!s) + return 0; /* can't happen */ + + //bpf_printk("[VMA] Enumerating VMAs for task %d (%s)", task->pid, task->comm); + + bpf_for_each(task_vma, vma, task, 0) { + struct bpf_dynptr fdptr; + struct inode *inode; + struct file *file; + + if (!(vma->vm_flags & VM_EXEC)) + continue; + + file = vma->vm_file; + if (!file) + continue; + inode = file->f_inode; + if (!inode) + continue; + + /* + * This is a cheap and effective way to minimize reparsing of the same ELF, but + * it doesn't guarantee that each unique inode will be processed just once. This + * is acceptable for an example, though. + */ + u64 ino = inode->i_ino; + if (last_ino == ino) + continue; + + const char *vma_name = (const char *)file->f_path.dentry->d_name.name; + //bpf_printk("[VMA] Executable file-backed VMA: 0x%lx-0x%lx (ino=%llu, name=%s)", + // vma->vm_start, vma->vm_end, ino, vma_name); + + err = bpf_dynptr_from_file(file, 0, &fdptr); + if (err) { + bpf_printk(" [ELF] Failed to create dynptr for (ino=%llu, name=%s): %d", ino, vma_name, err); + goto next; + } + + err = parse_elf(&fdptr, &s->elf, s); + if (err) + goto next; + + if (task->pid == task->tgid) { + int sym_idx = find_sym(&fdptr, &s->elf, "Py_Version", STT_OBJECT, s); + if (sym_idx > 0) { + long py_ver_addr = vma->vm_start - vma->vm_pgoff * __PAGE_SIZE + s->sym.st_value; + bpf_printk("[PY] Found 'Py_Version' global variable for PID %d (%s) in '%s' at %px", + task->pid, task->comm, vma_name, py_ver_addr); + + __u32 py_ver; + err = bpf_copy_from_user_task(&py_ver, sizeof(py_ver), (void *)py_ver_addr, task, 0); + if (err) { + bpf_printk("[PY] Failed to read Py_Version at %px for '%s': %d", + py_ver_addr, vma_name, err); + } else { + bpf_printk("[PY] PID %d (%s) is running Python v%u.%u.%u!", + task->pid, 
task->comm,
+						   (u8)(py_ver >> 24), (u8)(py_ver >> 16), (u8)(py_ver >> 8));
+					event->py_ver = py_ver;
+				}
+			}
+		}
+
+		//print_symbols(&fdptr, &s->elf, s);
+
+		long tls_addr = find_tls_var(task, vma, vma_name, &fdptr, tls_var_name, s);
+		if (tls_addr == -ENOENT) /* variable is not defined by this ELF: skip silently */
+			goto next;
+		if (tls_addr < 0) {
+			bpf_printk("[TLS] Failed to figure TLS address of '%s' variable: %ld", tls_var_name, tls_addr);
+			goto next;
+		}
+
+		/* Read the actual TLS variable (sizeof(int) bytes) from the task's memory */
+		int val;
+		err = bpf_copy_from_user_task(&val, sizeof(val), (void *)tls_addr, task, 0);
+		if (err) {
+			bpf_printk("[TLS] Failed to read TLS var at %px: %d", tls_addr, err);
+			goto next;
+		}
+
+		bpf_printk("[TLS] TLS variable '%s' found in '%s' (TID %d '%s') = %d",
+			   tls_var_name, vma_name, task->pid, task->comm, val);
+
+		event->has_tls = true;
+		event->tls_value = val;
+
+next:
+		bpf_dynptr_file_discard(&fdptr); /* reached on every path after bpf_dynptr_from_file(), including its failure */
+
+		last_ino = ino; /* lets the next VMA of the same inode be skipped */
+	}
+
+	return 0;
+}
+
+static int task_work_cb(struct bpf_map *map, void *key, void *value) /* callback given to bpf_task_work_schedule_signal_impl() */
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct task_state *state = value;
+	struct task_event *event = &state->event;
+	u32 tid = task->pid;
+
+	if (event->tid != tid) { /* event was prepared for a different task than the current one */
+		bpf_printk("MISMATCHED PID %d != expected %d", tid, event->tid);
+		goto cleanup;
+	}
+
+	event->py_ver = 0;
+	event->has_tls = false;
+	event->ustack_sz = unwind_user_stack(task, event->ustack, MAX_STACK_DEPTH);
+
+	enumerate_vmas(task, event); /* may set py_ver, has_tls and tls_value */
+
+	bpf_ringbuf_output(&rb, event, sizeof(*event), 0);
+cleanup:
+	bpf_map_delete_elem(&task_states, key); /* the per-task state is dropped on both paths */
+	return 0;
+}
+
+/*
+ * THIS DOESN'T CURRENTLY WORK:
+ *   static struct task_state empty_state;
+ *
+ * Verifier will complain:
+ *   bpf_task_work cannot be accessed directly by load/store
+ */
+static char empty_state[sizeof(struct task_state)];
+
+SEC("iter.s/task")
+int snoop_tasks(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	struct task_state *state;
+	struct task_event *event;
+ u32 tid; + int err; + + if (!task) + return 0; + + tid = task->pid; + + err = bpf_map_update_elem(&task_states, &tid, &empty_state, BPF_NOEXIST); + if (err) { + bpf_printk("Unexpected error adding task state for %d (%s): %d", tid, task->comm, err); + return 0; + } + state = bpf_map_lookup_elem(&task_states, &tid); + if (!state) { + bpf_printk("Unexpected error fetching task state for %d (%s): %d", tid, task->comm, err); + return 0; + } + + event = &state->event; + event->pid = task->tgid; + event->tid = task->pid; + bpf_probe_read_kernel_str(event->comm, TASK_COMM_LEN, task->comm); + + event->kstack_sz = bpf_get_task_stack(task, event->kstack, sizeof(event->kstack), 0); + + err = bpf_task_work_schedule_signal_impl(task, &state->tw, &task_states, task_work_cb, NULL); + if (err) { + bpf_printk("Unexpected error scheduling task work %d (%s): %d", tid, task->comm, err); + bpf_map_delete_elem(&task_states, &tid); + return 0; + } + + return 0; +} diff --git a/examples/c/snooper.c b/examples/c/snooper.c new file mode 100644 index 00000000..060cc725 --- /dev/null +++ b/examples/c/snooper.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2025 Meta Platforms, Inc. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "snooper.skel.h" +#include "snooper.h" +#include "blazesym.h" + +static struct blaze_symbolizer *symbolizer; +static volatile bool exiting = false; +struct snooper_bpf *skel; + +static void sig_handler(int sig) +{ + exiting = true; +} + +static void print_frame(const char *name, uintptr_t input_addr, uintptr_t addr, + uint64_t offset, const blaze_symbolize_code_info* code_info) +{ + if (input_addr != 0) { + printf(" %016lx: %s @ 0x%lx+0x%lx", input_addr, name, addr, offset); + if (code_info != NULL && code_info->dir != NULL && code_info->file != NULL) { + printf(" %s/%s:%u\n", code_info->dir, code_info->file, code_info->line); + } else if (code_info != NULL && code_info->file != NULL) { + printf(" %s:%u\n", code_info->file, code_info->line); + } else { + printf("\n"); + } + } else { + printf(" %16s %s", "", name); + if (code_info != NULL && code_info->dir != NULL && code_info->file != NULL) { + printf("@ %s/%s:%u [inlined]\n", code_info->dir, code_info->file, code_info->line); + } else if (code_info != NULL && code_info->file != NULL) { + printf("@ %s:%u [inlined]\n", code_info->file, code_info->line); + } else { + printf("[inlined]\n"); + } + } +} + +static void show_stack_trace(__u64 *stack, int stack_sz, pid_t pid) +{ + const struct blaze_symbolize_inlined_fn* inlined; + const struct blaze_syms *syms; + const struct blaze_sym *sym; + int i, j; + + assert(sizeof(uintptr_t) == sizeof(uint64_t)); + + if (pid) { + struct blaze_symbolize_src_process src = { + .type_size = sizeof(src), + .pid = pid, + }; + + syms = blaze_symbolize_process_abs_addrs(symbolizer, &src, + (const uintptr_t *)stack, stack_sz); + } else { + struct blaze_symbolize_src_kernel src = { + .type_size = sizeof(src), + }; + + syms = blaze_symbolize_kernel_abs_addrs(symbolizer, &src, + (const uintptr_t *)stack, stack_sz); + } + + if (!syms) { + printf(" failed to symbolize addresses: 
%s\n", blaze_err_str(blaze_err_last())); + return; + } + + for (i = 0; i < stack_sz; i++) { + if (!syms || syms->cnt <= i || syms->syms[i].name == NULL) { + printf(" %016llx: \n", stack[i]); + continue; + } + + sym = &syms->syms[i]; + print_frame(sym->name, stack[i], sym->addr, sym->offset, &sym->code_info); + + for (j = 0; j < sym->inlined_cnt; j++) { + inlined = &sym->inlined[j]; + print_frame(inlined->name, 0, 0, 0, &inlined->code_info); + } + } + + blaze_syms_free(syms); +} + +/* Ringbuf callback for task events */ +static int handle_event(void *ctx, void *data, size_t size) +{ + struct task_event *event = data; + + printf("Task: %s (PID=%d, TID=%d)\n", event->comm, event->pid, event->tid); + + if (event->py_ver) { + printf(" Running Python v%u.%u.%u!\n", + (__u8)(event->py_ver >> 24), + (__u8)(event->py_ver >> 16), + (__u8)(event->py_ver >> 8)); + } + + if (event->has_tls) + printf(" TLS: %s = %d\n", skel->bss->tls_var_name, (int)event->tls_value); + + /* Show kernel stack trace */ + if (event->kstack_sz > 0) { + printf(" Kernel stack:\n"); + show_stack_trace(event->kstack, event->kstack_sz / sizeof(__u64), 0); + } else if (event->kstack_sz < 0) { + printf(" Kernel stack error: %d\n", event->kstack_sz); + } else { + printf(" No kernel stack\n"); + } + + /* Show user stack trace */ + if (event->ustack_sz > 0) { + printf(" User stack:\n"); + show_stack_trace(event->ustack, event->ustack_sz / sizeof(__u64), event->pid); + } else if (event->ustack_sz < 0) { + printf(" User stack error: %d\n", event->ustack_sz); + } else { + printf(" No user stack\n"); + } + + printf("\n"); + return 0; +} + +static void show_help(const char *progname) +{ + printf("Usage: %s \n", progname); + printf(" PID Process ID to filter tasks (required)\n"); +} + +int main(int argc, char **argv) +{ + struct ring_buffer *rb = NULL; + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + pid_t pid_filter = 0; + int iter_fd = -1; + int err = 0; + char dummy; + + if (argc < 
3) { + show_help(argv[0]); + return 1; + } + + errno = 0; + pid_filter = (pid_t)strtol(argv[1], NULL, 10); + err = -errno; + if (err != 0 || pid_filter <= 0) { + fprintf(stderr, "Failed to parse PID '%s'\n", argv[1]); + show_help(argv[0]); + return 1; + } + + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + skel = snooper_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open and load BPF skeleton\n"); + err = -1; + goto cleanup; + } + + snprintf(skel->bss->tls_var_name, sizeof(skel->bss->tls_var_name), + "%s", argv[2]); + + symbolizer = blaze_symbolizer_new(); + if (!symbolizer) { + fprintf(stderr, "Failed to create symbolizer\n"); + err = -1; + goto cleanup; + } + + rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); + if (!rb) { + fprintf(stderr, "Failed to create ring buffer\n"); + err = -1; + goto cleanup; + } + + memset(&linfo, 0, sizeof(linfo)); + linfo.task.pid = pid_filter; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + skel->links.snoop_tasks = bpf_program__attach_iter(skel->progs.snoop_tasks, &opts); + if (!skel->links.snoop_tasks) { + err = -errno; + fprintf(stderr, "Failed to attach BPF iterator\n"); + goto cleanup; + } + + iter_fd = bpf_iter_create(bpf_link__fd(skel->links.snoop_tasks)); + if (iter_fd < 0) { + err = -errno; + fprintf(stderr, "Failed to create iterator\n"); + goto cleanup; + } + + printf("Snooping on tasks for PID %d...\n\n", pid_filter); + + /* trigger task iterator program */ + while (read(iter_fd, &dummy, sizeof(dummy)) > 0) { + /* nothing */ + } + + while (!exiting) { + err = ring_buffer__poll(rb, 100 /* timeout */); + if (err < 0 && err != -EINTR) { + fprintf(stderr, "Error polling ring buffer: %d\n", err); + break; + } + if (err == 0) + break; + } + +cleanup: + if (iter_fd >= 0) + close(iter_fd); + ring_buffer__free(rb); + snooper_bpf__destroy(skel); + blaze_symbolizer_free(symbolizer); + + return err < 0 ? 
-err : 0; +} diff --git a/examples/c/snooper.h b/examples/c/snooper.h new file mode 100644 index 00000000..93a34b8f --- /dev/null +++ b/examples/c/snooper.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2025 Meta Platforms, Inc. */ +#ifndef __SNOOPER_H_ +#define __SNOOPER_H_ + +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +#ifndef MAX_STACK_DEPTH +#define MAX_STACK_DEPTH 128 +#endif + +typedef __u64 stack_trace_t[MAX_STACK_DEPTH]; + +struct task_event { + __u32 pid; + __u32 tid; + char comm[TASK_COMM_LEN]; + __s32 kstack_sz; + __s32 ustack_sz; + stack_trace_t kstack; + stack_trace_t ustack; + bool has_tls; + long tls_value; + __u32 py_ver; +}; + +#endif /* __SNOOPER_H_ */