diff --git a/support/ebpf/extmaps.h b/support/ebpf/extmaps.h index 8b3676476..538265766 100644 --- a/support/ebpf/extmaps.h +++ b/support/ebpf/extmaps.h @@ -16,13 +16,7 @@ extern struct interpreter_offsets_t interpreter_offsets; extern struct trace_events_t trace_events; extern struct go_labels_procs_t go_labels_procs; -#if defined(TESTING_COREDUMP) - -// References to maps in alphabetical order that -// are needed only for testing. - -extern struct apm_int_procs_t apm_int_procs; -extern struct beam_procs_t beam_procs; +// These are needed by both native and hybrid python unwinder. extern struct exe_id_to_8_stack_deltas_t exe_id_to_8_stack_deltas; extern struct exe_id_to_9_stack_deltas_t exe_id_to_9_stack_deltas; extern struct exe_id_to_10_stack_deltas_t exe_id_to_10_stack_deltas; @@ -39,14 +33,21 @@ extern struct exe_id_to_20_stack_deltas_t exe_id_to_20_stack_deltas; extern struct exe_id_to_21_stack_deltas_t exe_id_to_21_stack_deltas; extern struct exe_id_to_22_stack_deltas_t exe_id_to_22_stack_deltas; extern struct exe_id_to_23_stack_deltas_t exe_id_to_23_stack_deltas; +extern struct stack_delta_page_to_info_t stack_delta_page_to_info; +extern struct unwind_info_array_t unwind_info_array; + +#if defined(TESTING_COREDUMP) + +// References to maps in alphabetical order that +// are needed only for testing. 
+extern struct apm_int_procs_t apm_int_procs; +extern struct beam_procs_t beam_procs; extern struct hotspot_procs_t hotspot_procs; extern struct dotnet_procs_t dotnet_procs; extern struct perl_procs_t perl_procs; extern struct php_procs_t php_procs; extern struct py_procs_t py_procs; extern struct ruby_procs_t ruby_procs; -extern struct stack_delta_page_to_info_t stack_delta_page_to_info; -extern struct unwind_info_array_t unwind_info_array; extern struct v8_procs_t v8_procs; #endif // TESTING_COREDUMP diff --git a/support/ebpf/native_stack_trace.ebpf.c b/support/ebpf/native_stack_trace.ebpf.c index 5fd291634..2e95624df 100644 --- a/support/ebpf/native_stack_trace.ebpf.c +++ b/support/ebpf/native_stack_trace.ebpf.c @@ -61,10 +61,6 @@ STACK_DELTA_BUCKET(21); STACK_DELTA_BUCKET(22); STACK_DELTA_BUCKET(23); -// Unwind info value for invalid stack delta -#define STACK_DELTA_INVALID (STACK_DELTA_COMMAND_FLAG | UNWIND_COMMAND_INVALID) -#define STACK_DELTA_STOP (STACK_DELTA_COMMAND_FLAG | UNWIND_COMMAND_STOP) - // An array of unwind info contains the all the different UnwindInfo instances // needed system wide. Individual stack delta entries refer to this array. struct unwind_info_array_t { @@ -74,9 +70,6 @@ struct unwind_info_array_t { __uint(max_entries, UNWIND_INFO_MAX_ENTRIES); } unwind_info_array SEC(".maps"); -// The number of native frames to unwind per frame-unwinding eBPF program. -#define NATIVE_FRAMES_PER_PROGRAM 5 - // The decision whether to unwind native stacks or interpreter stacks is made by checking if a given // PC address falls into the "interpreter loop" of an interpreter. 
This map helps identify such // loops: The keys are those executable section IDs that contain interpreter loops, the values @@ -96,446 +89,7 @@ struct stack_delta_page_to_info_t { __uint(max_entries, 40000); } stack_delta_page_to_info SEC(".maps"); -// Record a native frame -static EBPF_INLINE ErrorCode -push_native(UnwindState *state, Trace *trace, u64 file, u64 line, bool return_address) -{ - const u8 ra_flag = return_address ? FRAME_FLAG_RETURN_ADDRESS : 0; - - u64 *data = push_frame(state, trace, FRAME_MARKER_NATIVE, ra_flag, line, 1); - if (!data) { - return ERR_STACK_LENGTH_EXCEEDED; - } - data[0] = file; - return ERR_OK; -} - -// A single step for the bsearch into the big_stack_deltas array. This is really a textbook bsearch -// step, built in a way to update the value of *lo and *hi. This function will be called repeatedly -// (since we cannot do loops). The return value signals whether the bsearch came to an end / found -// the right element or whether it needs to continue. -static EBPF_INLINE bool bsearch_step(void *inner_map, u32 *lo, u32 *hi, u16 page_offset) -{ - u32 pivot = (*lo + *hi) >> 1; - StackDelta *delta = bpf_map_lookup_elem(inner_map, &pivot); - if (!delta) { - *hi = 0; - return false; - } - if (page_offset >= delta->addrLow) { - *lo = pivot + 1; - } else { - *hi = pivot; - } - return *lo < *hi; -} - -// Get the outer map based on the number of stack delta entries. 
-static EBPF_INLINE void *get_stack_delta_map(int mapID) -{ - switch (mapID) { - case 8: return &exe_id_to_8_stack_deltas; - case 9: return &exe_id_to_9_stack_deltas; - case 10: return &exe_id_to_10_stack_deltas; - case 11: return &exe_id_to_11_stack_deltas; - case 12: return &exe_id_to_12_stack_deltas; - case 13: return &exe_id_to_13_stack_deltas; - case 14: return &exe_id_to_14_stack_deltas; - case 15: return &exe_id_to_15_stack_deltas; - case 16: return &exe_id_to_16_stack_deltas; - case 17: return &exe_id_to_17_stack_deltas; - case 18: return &exe_id_to_18_stack_deltas; - case 19: return &exe_id_to_19_stack_deltas; - case 20: return &exe_id_to_20_stack_deltas; - case 21: return &exe_id_to_21_stack_deltas; - case 22: return &exe_id_to_22_stack_deltas; - case 23: return &exe_id_to_23_stack_deltas; - default: return NULL; - } -} - -// Get the stack offset of the given instruction. -static EBPF_INLINE ErrorCode get_stack_delta(UnwindState *state, int *addrDiff, u32 *unwindInfo) -{ - u64 exe_id = state->text_section_id; - - // Look up the stack delta page information for this address. 
- StackDeltaPageKey key = {}; - key.fileID = state->text_section_id; - key.page = state->text_section_offset & ~STACK_DELTA_PAGE_MASK; - DEBUG_PRINT( - "Look up stack delta for %lx:%lx", - (unsigned long)state->text_section_id, - (unsigned long)state->text_section_offset); - StackDeltaPageInfo *info = bpf_map_lookup_elem(&stack_delta_page_to_info, &key); - if (!info) { - DEBUG_PRINT( - "Failure to look up stack delta page fileID %lx, page %lx", - (unsigned long)key.fileID, - (unsigned long)key.page); - state->error_metric = metricID_UnwindNativeErrLookupTextSection; - return ERR_NATIVE_LOOKUP_TEXT_SECTION; - } - - void *outer_map = get_stack_delta_map(info->mapID); - if (!outer_map) { - DEBUG_PRINT( - "Failure to look up outer map for text section %lx in mapID %d", - (unsigned long)exe_id, - (int)info->mapID); - state->error_metric = metricID_UnwindNativeErrLookupStackDeltaOuterMap; - return ERR_NATIVE_LOOKUP_STACK_DELTA_OUTER_MAP; - } - - void *inner_map = bpf_map_lookup_elem(outer_map, &exe_id); - if (!inner_map) { - DEBUG_PRINT("Failure to look up inner map for text section %lx", (unsigned long)exe_id); - state->error_metric = metricID_UnwindNativeErrLookupStackDeltaInnerMap; - return ERR_NATIVE_LOOKUP_STACK_DELTA_INNER_MAP; - } - - // Preinitialize the idx for the index to use for page without any deltas. - u32 idx = info->firstDelta; - u16 page_offset = state->text_section_offset & STACK_DELTA_PAGE_MASK; - if (info->numDeltas) { - // Page has deltas, so find the correct one to use using binary search. - u32 lo = info->firstDelta; - u32 hi = lo + info->numDeltas; - - DEBUG_PRINT( - "Intervals should be from %lu to %lu (mapID %d)", - (unsigned long)lo, - (unsigned long)hi, - (int)info->mapID); - - // Do the binary search, up to 16 iterations. Deltas are paged to 64kB pages. - // They can contain at most 64kB deltas even if everything is single byte opcodes. 
- int i; - for (i = 0; i < 16; i++) { - if (!bsearch_step(inner_map, &lo, &hi, page_offset)) { - break; - } - } - if (i >= 16 || hi == 0) { - DEBUG_PRINT("Failed bsearch in 16 steps. Corrupt data?"); - state->error_metric = metricID_UnwindNativeErrLookupIterations; - return ERR_NATIVE_EXCEEDED_DELTA_LOOKUP_ITERATIONS; - } - // After bsearch, 'hi' points to the first entry greater than the requested. - idx = hi; - } - - // The code above found the first entry with greater address than requested, - // so it needs to be decremented by one to get the entry with equal-or-less. - // This makes also the logic work cross-pages: if the first entry in within - // the page is too large, this actually gets the entry from the previous page. - idx--; - - StackDelta *delta = bpf_map_lookup_elem(inner_map, &idx); - if (!delta) { - state->error_metric = metricID_UnwindNativeErrLookupRange; - return ERR_NATIVE_LOOKUP_RANGE; - } - - DEBUG_PRINT( - "delta index %d, addrLow 0x%x, unwindInfo %d", idx, delta->addrLow, delta->unwindInfo); - - // Calculate PC delta from stack delta for merged delta comparison - int deltaOffset = (int)page_offset - (int)delta->addrLow; - if (idx < info->firstDelta) { - // PC is below the first delta of the corresponding page. This means that - // delta->addrLow contains address relative to one page before the page_offset. - // Fix up the deltaOffset with this difference of base pages. - deltaOffset += 1 << STACK_DELTA_PAGE_BITS; - } - - *addrDiff = deltaOffset; - *unwindInfo = delta->unwindInfo; - - if (delta->unwindInfo == STACK_DELTA_INVALID) { - state->error_metric = metricID_UnwindNativeErrStackDeltaInvalid; - return ERR_NATIVE_STACK_DELTA_INVALID; - } - if (delta->unwindInfo == STACK_DELTA_STOP) { - increment_metric(metricID_UnwindNativeStackDeltaStop); - } - - return ERR_OK; -} - -// unwind_calc_register calculates the given basic register expression of -// format "BASE_REG + param". 
-static EBPF_INLINE u64 unwind_calc_register(UnwindState *state, u8 baseReg, s32 param) -{ - return state->regs[baseReg % (sizeof(state->regs) / sizeof(state->regs[0]))] + param; -} - -#if defined(__x86_64__) - -// unwind_calc_register_with_deref calculates the expression as: -// - basic expression "BASE_REG + param" -// - expression with a dereference "*(BASE_REG + preDeref) + postDeref" -static EBPF_INLINE u64 -unwind_calc_register_with_deref(UnwindState *state, u8 baseReg, s32 param, bool deref) -{ - s32 preDeref = param, postDeref = 0; - - if (deref) { - // For expressions that dereference the base expression, the parameter is constructed - // of pre-dereference and post-derefence operands. Unpack those. - preDeref &= ~UNWIND_DEREF_MASK; - postDeref = (param & UNWIND_DEREF_MASK) * UNWIND_DEREF_MULTIPLIER; - } - - // Resolve the "BASE + param" before potential derereference - u64 addr = unwind_calc_register(state, baseReg, preDeref); - if (!deref) { - // All done: return "BASE + param" - return addr; - } - - // Dereference, and add the postDereference adder. - unsigned long val; - if (bpf_probe_read_user(&val, sizeof(val), (void *)addr)) { - DEBUG_PRINT("unwind failed to dereference address 0x%lx", (unsigned long)addr); - return 0; - } - // Return: "*(BASE + preDeref) + postDeref" - return val + postDeref; -} -#endif - -// Stack unwinding in the absence of frame pointers can be a bit involved, so -// this comment explains what the following code does. -// -// One begins unwinding a frame somewhere in the middle of execution. -// On x86_64, registers RIP (PC), RSP (SP), and RBP (FP) are available. -// -// This function resolves a "stack delta" command from from our internal maps. -// This stack delta refers to a rule on how to unwind the state. In the simple -// case it just provides SP delta and potentially offset from where to recover -// FP value. See unwind_calc_register[_with_deref]() on the expressions supported. 
-// -// The function sets the bool pointed to by the given `stop` pointer to `false` -// if the main ebpf unwinder should exit. This is the case if the current PC -// is marked with UNWIND_COMMAND_STOP which marks entry points (main function, -// thread spawn function, signal handlers, ...). -#if defined(__x86_64__) -static EBPF_INLINE ErrorCode unwind_one_frame(UnwindState *state, bool *stop) -{ - *stop = false; - - u32 unwindInfo = 0; - u64 rt_regs[18]; - int addrDiff = 0; - u64 cfa = 0; - - // The relevant executable is compiled with frame pointer omission, so - // stack deltas need to be retrieved from the relevant map. - ErrorCode error = get_stack_delta(state, &addrDiff, &unwindInfo); - if (error) { - return error; - } - - if (unwindInfo & STACK_DELTA_COMMAND_FLAG) { - switch (unwindInfo & ~STACK_DELTA_COMMAND_FLAG) { - case UNWIND_COMMAND_PLT: - // The toolchains routinely emit a fixed DWARF expression to unwind the full - // PLT table with one expression to reduce .eh_frame size. - // This is the hard coded implementation of this expression. For further details, - // see https://hal.inria.fr/hal-02297690/document, page 4. (DOI: 10.1145/3360572) - cfa = state->sp + 8 + ((((state->pc & 15) >= 11) ? 
1 : 0) << 3); - DEBUG_PRINT("PLT, cfa=0x%lx", (unsigned long)cfa); - break; - case UNWIND_COMMAND_SIGNAL: - // The rt_sigframe is defined at: - // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/include/asm/sigframe.h?h=v6.4#n59 - // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/include/uapi/asm/sigcontext.h?h=v6.4#n238 - // offsetof(struct rt_sigframe, uc.uc_mcontext) = 40 - if (bpf_probe_read_user(&rt_regs, sizeof(rt_regs), (void *)(state->sp + 40))) { - goto err_native_pc_read; - } - state->rax = rt_regs[13]; - state->r9 = rt_regs[1]; - state->r11 = rt_regs[3]; - state->r13 = rt_regs[5]; - state->r15 = rt_regs[7]; - state->fp = rt_regs[10]; - state->sp = rt_regs[15]; - state->pc = rt_regs[16]; - - state->return_address = false; - DEBUG_PRINT("signal frame"); - goto frame_ok; - case UNWIND_COMMAND_STOP: *stop = true; return ERR_OK; - case UNWIND_COMMAND_FRAME_POINTER: - if (!unwinder_unwind_frame_pointer(state)) { - goto err_native_pc_read; - } - goto frame_ok; - default: return ERR_UNREACHABLE; - } - } else { - UnwindInfo *info = bpf_map_lookup_elem(&unwind_info_array, &unwindInfo); - if (!info) { - increment_metric(metricID_UnwindNativeErrBadUnwindInfoIndex); - return ERR_NATIVE_BAD_UNWIND_INFO_INDEX; - } - - s32 param = info->param; - if (info->mergeOpcode) { - DEBUG_PRINT("AddrDiff %d, merged delta %#02x", addrDiff, info->mergeOpcode); - if (addrDiff >= (info->mergeOpcode & ~MERGEOPCODE_NEGATIVE)) { - param += (info->mergeOpcode & MERGEOPCODE_NEGATIVE) ? -8 : 8; - DEBUG_PRINT("Merged delta match: cfaDelta=%d", unwindInfo); - } - } - - // Resolve the frame's CFA (previous PC is fixed to CFA) address, and - // the previous FP address if any. 
- state->cfa = cfa = unwind_calc_register_with_deref( - state, info->baseReg, param, (info->flags & UNWIND_FLAG_DEREF_CFA) != 0); - u64 fpa = unwind_calc_register(state, info->auxBaseReg, info->auxParam); - - if (fpa) { - bpf_probe_read_user(&state->fp, sizeof(state->fp), (void *)fpa); - } else if (info->baseReg == UNWIND_REG_FP) { - // FP used for recovery, but no new FP value received, clear FP - state->fp = 0; - } - } - - if (!cfa || bpf_probe_read_user(&state->pc, sizeof(state->pc), (void *)(cfa - 8))) { - err_native_pc_read: - increment_metric(metricID_UnwindNativeErrPCRead); - return ERR_NATIVE_PC_READ; - } - state->sp = cfa; - unwinder_mark_nonleaf_frame(state); -frame_ok: - increment_metric(metricID_UnwindNativeFrames); - return ERR_OK; -} -#elif defined(__aarch64__) -static EBPF_INLINE ErrorCode unwind_one_frame(struct UnwindState *state, bool *stop) -{ - *stop = false; - - u32 unwindInfo = 0; - int addrDiff = 0; - u64 rt_regs[34]; - - // The relevant executable is compiled with frame pointer omission, so - // stack deltas need to be retrieved from the relevant map. 
- ErrorCode error = get_stack_delta(state, &addrDiff, &unwindInfo); - if (error) { - return error; - } - - if (unwindInfo & STACK_DELTA_COMMAND_FLAG) { - switch (unwindInfo & ~STACK_DELTA_COMMAND_FLAG) { - case UNWIND_COMMAND_SIGNAL: - // On aarch64 the struct rt_sigframe is at: - // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/kernel/signal.c?h=v6.4#n39 - // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/include/uapi/asm/sigcontext.h?h=v6.4#n28 - // offsetof(struct rt_sigframe, uc.uc_mcontext.regs[0]) = 312 - // offsetof(struct rt_sigframe, uc) 128 + - // offsetof(struct ucontext, uc_mcontext) 176 + - // offsetof(struct sigcontext, regs[0]) 8 - if (bpf_probe_read_user(&rt_regs, sizeof(rt_regs), (void *)(state->sp + 312))) { - goto err_native_pc_read; - } - state->pc = normalize_pac_ptr(rt_regs[32]); - state->sp = rt_regs[31]; - state->fp = rt_regs[29]; - state->lr = normalize_pac_ptr(rt_regs[30]); - state->r20 = rt_regs[20]; - state->r22 = rt_regs[22]; - state->r28 = rt_regs[28]; - - state->return_address = false; - state->lr_invalid = false; - DEBUG_PRINT("signal frame"); - goto frame_ok; - case UNWIND_COMMAND_STOP: *stop = true; return ERR_OK; - case UNWIND_COMMAND_FRAME_POINTER: - if (!unwinder_unwind_frame_pointer(state)) { - goto err_native_pc_read; - } - goto frame_ok; - default: return ERR_UNREACHABLE; - } - } - - UnwindInfo *info = bpf_map_lookup_elem(&unwind_info_array, &unwindInfo); - if (!info) { - increment_metric(metricID_UnwindNativeErrBadUnwindInfoIndex); - DEBUG_PRINT("Giving up due to invalid unwind info array index"); - return ERR_NATIVE_BAD_UNWIND_INFO_INDEX; - } - - s32 param = info->param; - if (info->mergeOpcode) { - DEBUG_PRINT("AddrDiff %d, merged delta %#02x", addrDiff, info->mergeOpcode); - if (addrDiff >= (info->mergeOpcode & ~MERGEOPCODE_NEGATIVE)) { - param += (info->mergeOpcode & MERGEOPCODE_NEGATIVE) ? 
-8 : 8; - DEBUG_PRINT("Merged delta match: cfaDelta=%d", unwindInfo); - } - } - - // Resolve the frame CFA (previous PC is fixed to CFA) address - state->cfa = unwind_calc_register(state, info->baseReg, param); - - // Resolve Return Address, it is either the value of link register or - // stack address where RA is stored - u64 ra = unwind_calc_register(state, info->auxBaseReg, info->auxParam); - if (!ra) { - if (info->auxBaseReg == UNWIND_REG_LR) { - increment_metric(metricID_UnwindNativeLr0); - } else { - err_native_pc_read: - increment_metric(metricID_UnwindNativeErrPCRead); - } - // report failure to resolve RA and stop unwinding - DEBUG_PRINT("Giving up due to failure to resolve RA"); - return ERR_NATIVE_PC_READ; - } - - if (info->auxBaseReg == UNWIND_REG_LR) { - // Allow LR unwinding only if it's known to be valid: either because - // it's the topmost user-mode frame, or recovered by signal trampoline. - if (state->lr_invalid) { - increment_metric(metricID_UnwindNativeErrLrUnwindingMidTrace); - return ERR_NATIVE_LR_UNWINDING_MID_TRACE; - } - } else { - DEBUG_PRINT("RA: %016llX", (u64)ra); - - // read the value of RA from stack - int err; - u64 fpra[2]; - fpra[0] = state->fp; - if (info->flags & UNWIND_FLAG_FRAME) { - err = bpf_probe_read_user(fpra, sizeof(fpra), (void *)(ra - 8)); - } else { - err = bpf_probe_read_user(&fpra[1], sizeof(fpra[0]), (void *)ra); - } - if (err) { - goto err_native_pc_read; - } - state->fp = fpra[0]; - ra = fpra[1]; - } - state->pc = normalize_pac_ptr(ra); - state->sp = state->cfa; - unwinder_mark_nonleaf_frame(state); -frame_ok: - increment_metric(metricID_UnwindNativeFrames); - return ERR_OK; -} -#else - #error unsupported architecture -#endif +#include "native_stack_trace.h" // unwind_native is the tail call destination for PROG_UNWIND_NATIVE. static EBPF_INLINE int unwind_native(struct pt_regs *ctx) @@ -573,7 +127,7 @@ static EBPF_INLINE int unwind_native(struct pt_regs *ctx) // Unwind the native frame using stack deltas. 
Stop if no next frame. bool stop; - error = unwind_one_frame(&record->state, &stop); + error = unwind_one_frame(record, &stop); if (error || stop) { break; } diff --git a/support/ebpf/native_stack_trace.h b/support/ebpf/native_stack_trace.h new file mode 100644 index 000000000..a0430cfe2 --- /dev/null +++ b/support/ebpf/native_stack_trace.h @@ -0,0 +1,464 @@ +#ifndef OPTI_NATIVE_STACK_TRACE_H +#define OPTI_NATIVE_STACK_TRACE_H + +#include "bpfdefs.h" +#include "extmaps.h" +#include "tracemgmt.h" + +// Unwind info value for invalid stack delta +#define STACK_DELTA_INVALID (STACK_DELTA_COMMAND_FLAG | UNWIND_COMMAND_INVALID) +#define STACK_DELTA_STOP (STACK_DELTA_COMMAND_FLAG | UNWIND_COMMAND_STOP) + +// The number of native frames to unwind per frame-unwinding eBPF program. +#define NATIVE_FRAMES_PER_PROGRAM 5 + +// Record a native frame +static EBPF_INLINE ErrorCode +push_native(UnwindState *state, Trace *trace, u64 file, u64 line, bool return_address) +{ + const u8 ra_flag = return_address ? FRAME_FLAG_RETURN_ADDRESS : 0; + + u64 *data = push_frame(state, trace, FRAME_MARKER_NATIVE, ra_flag, line, 1); + if (!data) { + return ERR_STACK_LENGTH_EXCEEDED; + } + data[0] = file; + return ERR_OK; +} + +// A single step for the bsearch into the big_stack_deltas array. This is really a textbook bsearch +// step, built in a way to update the value of *lo and *hi. This function will be called repeatedly +// (since we cannot do loops). The return value signals whether the bsearch came to an end / found +// the right element or whether it needs to continue. +static EBPF_INLINE bool bsearch_step(void *inner_map, u32 *lo, u32 *hi, u16 page_offset) +{ + u32 pivot = (*lo + *hi) >> 1; + StackDelta *delta = bpf_map_lookup_elem(inner_map, &pivot); + if (!delta) { + *hi = 0; + return false; + } + if (page_offset >= delta->addrLow) { + *lo = pivot + 1; + } else { + *hi = pivot; + } + return *lo < *hi; +} + +// Get the outer map based on the number of stack delta entries. 
+static EBPF_INLINE void *get_stack_delta_map(int mapID) +{ + switch (mapID) { + case 8: return &exe_id_to_8_stack_deltas; + case 9: return &exe_id_to_9_stack_deltas; + case 10: return &exe_id_to_10_stack_deltas; + case 11: return &exe_id_to_11_stack_deltas; + case 12: return &exe_id_to_12_stack_deltas; + case 13: return &exe_id_to_13_stack_deltas; + case 14: return &exe_id_to_14_stack_deltas; + case 15: return &exe_id_to_15_stack_deltas; + case 16: return &exe_id_to_16_stack_deltas; + case 17: return &exe_id_to_17_stack_deltas; + case 18: return &exe_id_to_18_stack_deltas; + case 19: return &exe_id_to_19_stack_deltas; + case 20: return &exe_id_to_20_stack_deltas; + case 21: return &exe_id_to_21_stack_deltas; + case 22: return &exe_id_to_22_stack_deltas; + case 23: return &exe_id_to_23_stack_deltas; + default: return NULL; + } +} + +// Get the stack offset of the given instruction. +static EBPF_INLINE ErrorCode get_stack_delta(UnwindState *state, int *addrDiff, u32 *unwindInfo) +{ + u64 exe_id = state->text_section_id; + + // Look up the stack delta page information for this address. 
+ StackDeltaPageKey key = {}; + key.fileID = state->text_section_id; + key.page = state->text_section_offset & ~STACK_DELTA_PAGE_MASK; + DEBUG_PRINT( + "Look up stack delta for %lx:%lx", + (unsigned long)state->text_section_id, + (unsigned long)state->text_section_offset); + StackDeltaPageInfo *info = bpf_map_lookup_elem(&stack_delta_page_to_info, &key); + if (!info) { + DEBUG_PRINT( + "Failure to look up stack delta page fileID %lx, page %lx", + (unsigned long)key.fileID, + (unsigned long)key.page); + state->error_metric = metricID_UnwindNativeErrLookupTextSection; + return ERR_NATIVE_LOOKUP_TEXT_SECTION; + } + + void *outer_map = get_stack_delta_map(info->mapID); + if (!outer_map) { + DEBUG_PRINT( + "Failure to look up outer map for text section %lx in mapID %d", + (unsigned long)exe_id, + (int)info->mapID); + state->error_metric = metricID_UnwindNativeErrLookupStackDeltaOuterMap; + return ERR_NATIVE_LOOKUP_STACK_DELTA_OUTER_MAP; + } + + void *inner_map = bpf_map_lookup_elem(outer_map, &exe_id); + if (!inner_map) { + DEBUG_PRINT("Failure to look up inner map for text section %lx", (unsigned long)exe_id); + state->error_metric = metricID_UnwindNativeErrLookupStackDeltaInnerMap; + return ERR_NATIVE_LOOKUP_STACK_DELTA_INNER_MAP; + } + + // Preinitialize the idx for the index to use for page without any deltas. + u32 idx = info->firstDelta; + u16 page_offset = state->text_section_offset & STACK_DELTA_PAGE_MASK; + if (info->numDeltas) { + // Page has deltas, so find the correct one to use using binary search. + u32 lo = info->firstDelta; + u32 hi = lo + info->numDeltas; + + DEBUG_PRINT( + "Intervals should be from %lu to %lu (mapID %d)", + (unsigned long)lo, + (unsigned long)hi, + (int)info->mapID); + + // Do the binary search, up to 16 iterations. Deltas are paged to 64kB pages. + // They can contain at most 64kB deltas even if everything is single byte opcodes. 
+ int i; + for (i = 0; i < 16; i++) { + if (!bsearch_step(inner_map, &lo, &hi, page_offset)) { + break; + } + } + if (i >= 16 || hi == 0) { + DEBUG_PRINT("Failed bsearch in 16 steps. Corrupt data?"); + state->error_metric = metricID_UnwindNativeErrLookupIterations; + return ERR_NATIVE_EXCEEDED_DELTA_LOOKUP_ITERATIONS; + } + // After bsearch, 'hi' points to the first entry greater than the requested. + idx = hi; + } + + // The code above found the first entry with greater address than requested, + // so it needs to be decremented by one to get the entry with equal-or-less. + // This makes also the logic work cross-pages: if the first entry in within + // the page is too large, this actually gets the entry from the previous page. + idx--; + + StackDelta *delta = bpf_map_lookup_elem(inner_map, &idx); + if (!delta) { + state->error_metric = metricID_UnwindNativeErrLookupRange; + return ERR_NATIVE_LOOKUP_RANGE; + } + + DEBUG_PRINT( + "delta index %d, addrLow 0x%x, unwindInfo %d", idx, delta->addrLow, delta->unwindInfo); + + // Calculate PC delta from stack delta for merged delta comparison + int deltaOffset = (int)page_offset - (int)delta->addrLow; + if (idx < info->firstDelta) { + // PC is below the first delta of the corresponding page. This means that + // delta->addrLow contains address relative to one page before the page_offset. + // Fix up the deltaOffset with this difference of base pages. + deltaOffset += 1 << STACK_DELTA_PAGE_BITS; + } + + *addrDiff = deltaOffset; + *unwindInfo = delta->unwindInfo; + + if (delta->unwindInfo == STACK_DELTA_INVALID) { + state->error_metric = metricID_UnwindNativeErrStackDeltaInvalid; + return ERR_NATIVE_STACK_DELTA_INVALID; + } + if (delta->unwindInfo == STACK_DELTA_STOP) { + increment_metric(metricID_UnwindNativeStackDeltaStop); + } + + return ERR_OK; +} + +// unwind_calc_register calculates the given basic register expression of +// format "BASE_REG + param". 
+static EBPF_INLINE u64 unwind_calc_register(UnwindState *state, u8 baseReg, s32 param) +{ + return state->regs[baseReg % (sizeof(state->regs) / sizeof(state->regs[0]))] + param; +} + +#if defined(__x86_64__) + +// unwind_calc_register_with_deref calculates the expression as: +// - basic expression "BASE_REG + param" +// - expression with a dereference "*(BASE_REG + preDeref) + postDeref" +static EBPF_INLINE u64 +unwind_calc_register_with_deref(UnwindState *state, u8 baseReg, s32 param, bool deref) +{ + s32 preDeref = param, postDeref = 0; + + if (deref) { + // For expressions that dereference the base expression, the parameter is constructed + // of pre-dereference and post-derefence operands. Unpack those. + preDeref &= ~UNWIND_DEREF_MASK; + postDeref = (param & UNWIND_DEREF_MASK) * UNWIND_DEREF_MULTIPLIER; + } + + // Resolve the "BASE + param" before potential derereference + u64 addr = unwind_calc_register(state, baseReg, preDeref); + if (!deref) { + // All done: return "BASE + param" + return addr; + } + + // Dereference, and add the postDereference adder. + unsigned long val; + if (bpf_probe_read_user(&val, sizeof(val), (void *)addr)) { + DEBUG_PRINT("unwind failed to dereference address 0x%lx", (unsigned long)addr); + return 0; + } + // Return: "*(BASE + preDeref) + postDeref" + return val + postDeref; +} +#endif + +// Stack unwinding in the absence of frame pointers can be a bit involved, so +// this comment explains what the following code does. +// +// One begins unwinding a frame somewhere in the middle of execution. +// On x86_64, registers RIP (PC), RSP (SP), and RBP (FP) are available. +// +// This function resolves a "stack delta" command from from our internal maps. +// This stack delta refers to a rule on how to unwind the state. In the simple +// case it just provides SP delta and potentially offset from where to recover +// FP value. See unwind_calc_register[_with_deref]() on the expressions supported. 
+// +// The function sets the bool pointed to by the given `stop` pointer to `false` +// if the main ebpf unwinder should exit. This is the case if the current PC +// is marked with UNWIND_COMMAND_STOP which marks entry points (main function, +// thread spawn function, signal handlers, ...). +#if defined(__x86_64__) +static EBPF_INLINE ErrorCode unwind_one_frame(PerCPURecord *record, bool *stop) +{ + *stop = false; + + UnwindState *state = &record->state; + u32 unwindInfo = 0; + int addrDiff = 0; + u64 cfa = 0; + + // The relevant executable is compiled with frame pointer omission, so + // stack deltas need to be retrieved from the relevant map. + ErrorCode error = get_stack_delta(state, &addrDiff, &unwindInfo); + if (error) { + return error; + } + + if (unwindInfo & STACK_DELTA_COMMAND_FLAG) { + switch (unwindInfo & ~STACK_DELTA_COMMAND_FLAG) { + case UNWIND_COMMAND_PLT: + // The toolchains routinely emit a fixed DWARF expression to unwind the full + // PLT table with one expression to reduce .eh_frame size. + // This is the hard coded implementation of this expression. For further details, + // see https://hal.inria.fr/hal-02297690/document, page 4. (DOI: 10.1145/3360572) + cfa = state->sp + 8 + ((((state->pc & 15) >= 11) ? 1 : 0) << 3); + DEBUG_PRINT("PLT, cfa=0x%lx", (unsigned long)cfa); + break; + case UNWIND_COMMAND_SIGNAL: { + // Use the PerCPURecord scratch union instead of a stack-local buffer to avoid + // exceeding the 512-byte BPF stack limit when inlined into interpreters. 
+ u64 *rt_regs = record->rt_regs; + // The rt_sigframe is defined at: + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/include/asm/sigframe.h?h=v6.4#n59 + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/include/uapi/asm/sigcontext.h?h=v6.4#n238 + // offsetof(struct rt_sigframe, uc.uc_mcontext) = 40 + if (bpf_probe_read_user(rt_regs, sizeof(record->rt_regs), (void *)(state->sp + 40))) { + goto err_native_pc_read; + } + state->rax = rt_regs[13]; + state->r9 = rt_regs[1]; + state->r11 = rt_regs[3]; + state->r13 = rt_regs[5]; + state->r15 = rt_regs[7]; + state->fp = rt_regs[10]; + state->sp = rt_regs[15]; + state->pc = rt_regs[16]; + + state->return_address = false; + DEBUG_PRINT("signal frame"); + goto frame_ok; + } + case UNWIND_COMMAND_STOP: *stop = true; return ERR_OK; + case UNWIND_COMMAND_FRAME_POINTER: + if (!unwinder_unwind_frame_pointer(state)) { + goto err_native_pc_read; + } + goto frame_ok; + default: return ERR_UNREACHABLE; + } + } else { + UnwindInfo *info = bpf_map_lookup_elem(&unwind_info_array, &unwindInfo); + if (!info) { + increment_metric(metricID_UnwindNativeErrBadUnwindInfoIndex); + return ERR_NATIVE_BAD_UNWIND_INFO_INDEX; + } + + s32 param = info->param; + if (info->mergeOpcode) { + DEBUG_PRINT("AddrDiff %d, merged delta %#02x", addrDiff, info->mergeOpcode); + if (addrDiff >= (info->mergeOpcode & ~MERGEOPCODE_NEGATIVE)) { + param += (info->mergeOpcode & MERGEOPCODE_NEGATIVE) ? -8 : 8; + DEBUG_PRINT("Merged delta match: cfaDelta=%d", unwindInfo); + } + } + + // Resolve the frame's CFA (previous PC is fixed to CFA) address, and + // the previous FP address if any. 
+ state->cfa = cfa = unwind_calc_register_with_deref( + state, info->baseReg, param, (info->flags & UNWIND_FLAG_DEREF_CFA) != 0); + u64 fpa = unwind_calc_register(state, info->auxBaseReg, info->auxParam); + + if (fpa) { + bpf_probe_read_user(&state->fp, sizeof(state->fp), (void *)fpa); + } else if (info->baseReg == UNWIND_REG_FP) { + // FP used for recovery, but no new FP value received, clear FP + state->fp = 0; + } + } + + if (!cfa || bpf_probe_read_user(&state->pc, sizeof(state->pc), (void *)(cfa - 8))) { + err_native_pc_read: + increment_metric(metricID_UnwindNativeErrPCRead); + return ERR_NATIVE_PC_READ; + } + state->sp = cfa; + unwinder_mark_nonleaf_frame(state); +frame_ok: + increment_metric(metricID_UnwindNativeFrames); + return ERR_OK; +} +#elif defined(__aarch64__) +static EBPF_INLINE ErrorCode unwind_one_frame(PerCPURecord *record, bool *stop) +{ + *stop = false; + + UnwindState *state = &record->state; + u32 unwindInfo = 0; + int addrDiff = 0; + + // The relevant executable is compiled with frame pointer omission, so + // stack deltas need to be retrieved from the relevant map. + ErrorCode error = get_stack_delta(state, &addrDiff, &unwindInfo); + if (error) { + return error; + } + + if (unwindInfo & STACK_DELTA_COMMAND_FLAG) { + switch (unwindInfo & ~STACK_DELTA_COMMAND_FLAG) { + case UNWIND_COMMAND_SIGNAL: { + // Use the PerCPURecord scratch union instead of a stack-local buffer to avoid + // exceeding the 512-byte BPF stack limit when inlined into interpreters. 
+ u64 *rt_regs = record->rt_regs; + // On aarch64 the struct rt_sigframe is at: + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/kernel/signal.c?h=v6.4#n39 + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/include/uapi/asm/sigcontext.h?h=v6.4#n28 + // offsetof(struct rt_sigframe, uc.uc_mcontext.regs[0]) = 312 + // offsetof(struct rt_sigframe, uc) 128 + + // offsetof(struct ucontext, uc_mcontext) 176 + + // offsetof(struct sigcontext, regs[0]) 8 + if (bpf_probe_read_user(rt_regs, sizeof(record->rt_regs), (void *)(state->sp + 312))) { + goto err_native_pc_read; + } + state->pc = normalize_pac_ptr(rt_regs[32]); + state->sp = rt_regs[31]; + state->fp = rt_regs[29]; + state->lr = normalize_pac_ptr(rt_regs[30]); + state->r20 = rt_regs[20]; + state->r22 = rt_regs[22]; + state->r28 = rt_regs[28]; + + state->return_address = false; + state->lr_invalid = false; + DEBUG_PRINT("signal frame"); + goto frame_ok; + } + case UNWIND_COMMAND_STOP: *stop = true; return ERR_OK; + case UNWIND_COMMAND_FRAME_POINTER: + if (!unwinder_unwind_frame_pointer(state)) { + goto err_native_pc_read; + } + goto frame_ok; + default: return ERR_UNREACHABLE; + } + } + + UnwindInfo *info = bpf_map_lookup_elem(&unwind_info_array, &unwindInfo); + if (!info) { + increment_metric(metricID_UnwindNativeErrBadUnwindInfoIndex); + DEBUG_PRINT("Giving up due to invalid unwind info array index"); + return ERR_NATIVE_BAD_UNWIND_INFO_INDEX; + } + + s32 param = info->param; + if (info->mergeOpcode) { + DEBUG_PRINT("AddrDiff %d, merged delta %#02x", addrDiff, info->mergeOpcode); + if (addrDiff >= (info->mergeOpcode & ~MERGEOPCODE_NEGATIVE)) { + param += (info->mergeOpcode & MERGEOPCODE_NEGATIVE) ? 
-8 : 8; + DEBUG_PRINT("Merged delta match: cfaDelta=%d", unwindInfo); + } + } + + // Resolve the frame CFA (previous PC is fixed to CFA) address + state->cfa = unwind_calc_register(state, info->baseReg, param); + + // Resolve Return Address, it is either the value of link register or + // stack address where RA is stored + u64 ra = unwind_calc_register(state, info->auxBaseReg, info->auxParam); + if (!ra) { + if (info->auxBaseReg == UNWIND_REG_LR) { + increment_metric(metricID_UnwindNativeLr0); + } else { + err_native_pc_read: + increment_metric(metricID_UnwindNativeErrPCRead); + } + // report failure to resolve RA and stop unwinding + DEBUG_PRINT("Giving up due to failure to resolve RA"); + return ERR_NATIVE_PC_READ; + } + + if (info->auxBaseReg == UNWIND_REG_LR) { + // Allow LR unwinding only if it's known to be valid: either because + // it's the topmost user-mode frame, or recovered by signal trampoline. + if (state->lr_invalid) { + increment_metric(metricID_UnwindNativeErrLrUnwindingMidTrace); + return ERR_NATIVE_LR_UNWINDING_MID_TRACE; + } + } else { + DEBUG_PRINT("RA: %016llX", (u64)ra); + + // read the value of RA from stack + int err; + u64 fpra[2]; + fpra[0] = state->fp; + if (info->flags & UNWIND_FLAG_FRAME) { + err = bpf_probe_read_user(fpra, sizeof(fpra), (void *)(ra - 8)); + } else { + err = bpf_probe_read_user(&fpra[1], sizeof(fpra[0]), (void *)ra); + } + if (err) { + goto err_native_pc_read; + } + state->fp = fpra[0]; + ra = fpra[1]; + } + state->pc = normalize_pac_ptr(ra); + state->sp = state->cfa; + unwinder_mark_nonleaf_frame(state); +frame_ok: + increment_metric(metricID_UnwindNativeFrames); + return ERR_OK; +} +#else + #error unsupported architecture +#endif + +#endif diff --git a/support/ebpf/python_tracer.ebpf.c b/support/ebpf/python_tracer.ebpf.c index cab564897..1fc64d611 100644 --- a/support/ebpf/python_tracer.ebpf.c +++ b/support/ebpf/python_tracer.ebpf.c @@ -2,15 +2,11 @@ #include "bpfdefs.h" #include "errors.h" +#include 
"native_stack_trace.h" #include "tracemgmt.h" #include "tsd.h" #include "types.h" -// The number of Python frames to unwind per frame-unwinding eBPF program. If -// we start running out of instructions in the walk_python_stack program, one -// option is to adjust this number downwards. -#define FRAMES_PER_WALK_PYTHON_STACK 12 - // Forward declaration to avoid warnings like // "declaration of 'struct pt_regs' will not be visible outside of this function [-Wvisibility]". struct pt_regs; @@ -141,8 +137,10 @@ static EBPF_INLINE ErrorCode process_python_frame( } // Read PyCodeObject - if (bpf_probe_read_user(pss->code, sizeof(pss->code), py_codeobject)) { - DEBUG_PRINT("Failed to read PyCodeObject at 0x%lx", (unsigned long)(py_codeobject)); + long pycode_err = bpf_probe_read_user(pss->code, sizeof(pss->code), py_codeobject); + if (pycode_err) { + DEBUG_PRINT( + "Failed to read PyCodeObject at 0x%lx err=%ld", (unsigned long)(py_codeobject), pycode_err); increment_metric(metricID_UnwindPythonErrBadCodeObjectArgCountAddr); return ERR_PYTHON_BAD_CODE_OBJECT_ADDR; } @@ -169,39 +167,6 @@ static EBPF_INLINE ErrorCode process_python_frame( return ERR_OK; } -static EBPF_INLINE ErrorCode -walk_python_stack(PerCPURecord *record, const PyProcInfo *pyinfo, int *unwinder) -{ - void *py_frame = record->pythonUnwindState.py_frame; - ErrorCode error = ERR_OK; - *unwinder = PROG_UNWIND_STOP; - - for (u32 i = 0; i < FRAMES_PER_WALK_PYTHON_STACK; ++i) { - bool continue_with_next; - error = process_python_frame(record, pyinfo, &py_frame, &continue_with_next); - if (error) { - goto stop; - } - if (continue_with_next) { - *unwinder = get_next_unwinder_after_interpreter(); - goto stop; - } - if (!py_frame) { - goto stop; - } - } - - *unwinder = PROG_UNWIND_PYTHON; - -stop: - // Set up the state for the next invocation of this unwinding program. 
- if (error || !py_frame) { - unwinder_mark_done(record, PROG_UNWIND_PYTHON); - } - record->pythonUnwindState.py_frame = py_frame; - return error; -} - // get_PyThreadState retrieves the PyThreadState* for the current thread. // // Python 3.12 and earlier set the thread_state using pthread_setspecific with the key @@ -288,6 +253,60 @@ static EBPF_INLINE ErrorCode get_PyFrame(const PyProcInfo *pyinfo, void **frame) return ERR_OK; } +// Number of loop iterations in unwind_python. Each iteration handles either +// one Python frame or one native frame depending on the current unwinder state. +// This is a RODATA variable so the host agent can tune it based on whether +// debug output is enabled (which affects the verifier instruction budget). +BPF_RODATA_VAR(u32, python_native_loop_iters, 6) + +// step_python processes one Python frame and updates *unwinder to indicate +// what should happen next +static EBPF_INLINE ErrorCode +step_python(PerCPURecord *record, const PyProcInfo *pyinfo, void **py_frame, int *unwinder) +{ + bool continue_with_next; + ErrorCode error = process_python_frame(record, pyinfo, py_frame, &continue_with_next); + if (error) { + *unwinder = PROG_UNWIND_STOP; + return error; + } + if (continue_with_next) { + *unwinder = get_next_unwinder_after_interpreter(); + } else if (!*py_frame) { + *unwinder = PROG_UNWIND_STOP; + } else { + *unwinder = PROG_UNWIND_PYTHON; + } + return ERR_OK; +} + +// step_native processes one native frame at an interpreter boundary and +// updates *unwinder +static EBPF_INLINE ErrorCode step_native(PerCPURecord *record, int *unwinder) +{ + Trace *trace = &record->trace; + *unwinder = PROG_UNWIND_STOP; + + increment_metric(metricID_UnwindNativeAttempts); + ErrorCode error = push_native( + &record->state, + trace, + record->state.text_section_id, + record->state.text_section_offset, + record->state.return_address); + if (error) { + return error; + } + + bool stop; + error = unwind_one_frame(record, &stop); + if (error || stop) 
{ + return error; + } + + return get_next_unwinder_after_native_frame(record, unwinder); +} + // unwind_python is the entry point for tracing when invoked from the native tracer // or interpreter dispatcher. It does not reset the trace object and will append the // Python stack frames to the trace object for the current CPU. @@ -298,7 +317,7 @@ static EBPF_INLINE int unwind_python(struct pt_regs *ctx) return -1; ErrorCode error = ERR_OK; - int unwinder = get_next_unwinder_after_interpreter(); + int unwinder = PROG_UNWIND_PYTHON; Trace *trace = &record->trace; u32 pid = trace->pid; @@ -327,7 +346,26 @@ static EBPF_INLINE int unwind_python(struct pt_regs *ctx) goto exit; } - error = walk_python_stack(record, pyinfo, &unwinder); + { + void *py_frame = record->pythonUnwindState.py_frame; + + for (u32 t = 0; t < python_native_loop_iters; t++) { + switch (unwinder) { + case PROG_UNWIND_PYTHON: error = step_python(record, pyinfo, &py_frame, &unwinder); break; + case PROG_UNWIND_NATIVE: error = step_native(record, &unwinder); break; + default: goto done; + } + if (error) { + goto done; + } + } + + done: + if (error || !py_frame) { + unwinder_mark_done(record, PROG_UNWIND_PYTHON); + } + record->pythonUnwindState.py_frame = py_frame; + } exit: record->state.unwind_error = error; diff --git a/support/ebpf/tracer.ebpf.amd64 b/support/ebpf/tracer.ebpf.amd64 index 9a44839d7..987f97024 100644 Binary files a/support/ebpf/tracer.ebpf.amd64 and b/support/ebpf/tracer.ebpf.amd64 differ diff --git a/support/ebpf/tracer.ebpf.arm64 b/support/ebpf/tracer.ebpf.arm64 index 4d55359b6..63784c640 100644 Binary files a/support/ebpf/tracer.ebpf.arm64 and b/support/ebpf/tracer.ebpf.arm64 differ diff --git a/support/ebpf/types.h b/support/ebpf/types.h index 048e3c02c..dc9477719 100644 --- a/support/ebpf/types.h +++ b/support/ebpf/types.h @@ -829,6 +829,13 @@ typedef struct PerCPURecord { GoMapBucket goMapBucket; // Scratch for Go 1.24 labels struct GoString labels[MAX_CUSTOM_LABELS * 2]; + // 
Signal frame registers for unwind_one_frame (avoids 272-byte stack alloc on arm64). + // Sized to match the kernel rt_sigframe register array for the target architecture. +#if defined(__x86_64__) + u64 rt_regs[18]; +#elif defined(__aarch64__) + u64 rt_regs[34]; +#endif }; // Mask to indicate which unwinders are complete u32 unwindersDone; diff --git a/tools/coredump/testdata/amd64/deep-python-call.json b/tools/coredump/testdata/amd64/deep-python-call.json new file mode 100644 index 000000000..b51a65c9f --- /dev/null +++ b/tools/coredump/testdata/amd64/deep-python-call.json @@ -0,0 +1,174 @@ +{ + "coredump-ref": "79a46c0367a921375076930fe211405a635c0985d1f631102d3dfeda7652d8a3", + "threads": [ + { + "lwp": 1236391, + "frames": [ + "libc.so.6+0xfa3d4", + "python3.12+0x4f94af", + "python3.12+0x581bb9", + "python3.12+0x549934", + "_make_levels..last_call+5 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:49", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + 
"python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in 
/home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "_make_levels..make_call..__call__+0 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:38", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x54a781", + "python3.12+0x5a36a7", + "python3.12+0x548f34", + "main+2 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:56", + "+58 in /home/tpr/src/py-nat-hybrid/tools/coredump/testsources/python/deep_call.py:59", + "+0 in :1", + "python3.12+0x5d733e", + "python3.12+0x5d582a", + "python3.12+0x6088d1", + "python3.12+0x6b4922", + "python3.12+0x6b4689", + "python3.12+0x6b44be", + "python3.12+0x6bc524", + 
"python3.12+0x6bc00c", + "libc.so.6+0x2a1c9", + "libc.so.6+0x2a28a", + "python3.12+0x657444" + ] + } + ], + "modules": [ + { + "ref": "c2c20b4745d447551221ec3d4e70f92c270c4609fe3df34fc52ea6dd46e92273", + "local-path": "/usr/bin/python3.12" + }, + { + "ref": "d8db8739a1633c972cec6a4fe0566bdcec6fd088f98723492ab0361f66238f75", + "local-path": "/usr/lib/x86_64-linux-gnu/libc.so.6" + }, + { + "ref": "c42ff317838b4b4639e2ea801905f0317177c6df7e31b2f0d0240e3c3ac0cfde", + "local-path": "/usr/lib/x86_64-linux-gnu/libexpat.so.1.9.1" + }, + { + "ref": "9b64150b28505a33d6bc3ecf709c279f6de97a1c184dbda65d06ee4537f6d286", + "local-path": "/usr/lib/x86_64-linux-gnu/libz.so.1.3" + }, + { + "ref": "1b87a1a50b496cfead2b0ad134c2ff536705c82608db240c7e8aa48d6c0e4217", + "local-path": "/usr/lib/x86_64-linux-gnu/libm.so.6" + }, + { + "ref": "1cd555ac46b7887edeaf3c42aac5408c8135e52f6b37870da2cf82d5fe14e829", + "local-path": "/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2" + } + ] +} diff --git a/tools/coredump/testsources/python/deep_call.py b/tools/coredump/testsources/python/deep_call.py new file mode 100644 index 000000000..4142bc810 --- /dev/null +++ b/tools/coredump/testsources/python/deep_call.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +"""Generate a deep Python call stack with interleaved Python/C frames. + +Each Python function call goes through CPython's C eval loop +(_PyFunction_Vectorcall → _PyEval_Vector → _PyEval_EvalFrame → +_PyEval_EvalFrameDefault → do_call_core), creating ~5 native frames +between every Python frame. With 20 class.__call__ levels, this +produces ~100 native frames interleaved with ~20 Python frames, +requiring ~40 Python↔native unwinder transitions. + +On main (tail-call design), this exceeds the 29 tail call limit and +truncates the stack. With the combined Python+native loop, the full +stack is unwound. 
+"""
+import os
+import signal
+import traceback
+
+class Level:
+    """Each subclass's __call__ invokes the next level via slot_tp_call."""
+    pass
+
+# Generate 20 levels of classes that chain-call each other.
+# Each __call__ goes through CPython's slot_tp_call (C) which creates
+# native frames between the Python frames.
+NUM_LEVELS = 20
+
+def _make_levels():
+    levels = []
+    for i in range(NUM_LEVELS):
+        levels.append(type(f'Level{i}', (Level,), {}))
+
+    # Wire up: each level's __call__ invokes the next level
+    for i in range(NUM_LEVELS - 1):
+        next_cls = levels[i + 1]
+        # Use a closure to capture next_cls
+        def make_call(nxt):
+            def __call__(self):
+                return nxt()()
+            return __call__
+        levels[i].__call__ = make_call(next_cls)
+
+    # Last level: print the stack and hang so we can capture a coredump
+    def last_call(self):
+        print(f"Reached level {NUM_LEVELS - 1}, PID={os.getpid()}", flush=True)
+        print("Stack trace:", flush=True)
+        traceback.print_stack()
+        print(f"\nWaiting for coredump (kill -ILL {os.getpid()})...", flush=True)
+        signal.pause()
+
+    levels[-1].__call__ = last_call
+    return levels
+
+def main():
+    levels = _make_levels()
+    levels[0]()()
+
+if __name__ == '__main__':
+    main()
diff --git a/tracer/systemconfig.go b/tracer/systemconfig.go
index 51781dd6a..c423d75a5 100644
--- a/tracer/systemconfig.go
+++ b/tracer/systemconfig.go
@@ -302,6 +302,13 @@ func loadRodataVars(coll *cebpf.CollectionSpec, kmod *kallsyms.Module, cfg *Conf
 		if err := coll.Variables["with_debug_output"].Set(uint32(1)); err != nil {
 			return fmt.Errorf("failed to set debug output: %v", err)
 		}
+	} else {
+		// Without debug output the verifier skips DEBUG_PRINT branches,
+		// leaving enough instruction budget to increase the Python
+		// unwinder loop iterations (default 6 -> 12).
+ if err := coll.Variables["python_native_loop_iters"].Set(uint32(12)); err != nil { + return fmt.Errorf("failed to set python_native_loop_iters: %v", err) + } } if err := coll.Variables["off_cpu_threshold"].Set(cfg.OffCPUThreshold); err != nil {