9696 * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
9797 * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
9898 *
99- * The padding size (0x100) is chosen to accommodate typical unwind info sizes
100- * while maintaining 16-byte alignment requirements.
99+ * The padding size is now calculated automatically during initialization
100+ * based on the actual unwind information requirements.
101101 */
102- #define PERF_JIT_CODE_PADDING 0x100
103102
104103/* Convenient access to the global trampoline API state */
105104#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
@@ -400,10 +399,12 @@ enum {
400399 DWRF_CFA_nop = 0x0 , // No operation
401400 DWRF_CFA_offset_extended = 0x5 , // Extended offset instruction
402401 DWRF_CFA_def_cfa = 0xc , // Define CFA rule
402+ DWRF_CFA_def_cfa_register = 0xd , // Define CFA register
403403 DWRF_CFA_def_cfa_offset = 0xe , // Define CFA offset
404404 DWRF_CFA_offset_extended_sf = 0x11 , // Extended signed offset
405405 DWRF_CFA_advance_loc = 0x40 , // Advance location counter
406- DWRF_CFA_offset = 0x80 // Simple offset instruction
406+ DWRF_CFA_offset = 0x80 , // Simple offset instruction
407+ DWRF_CFA_restore = 0xc0 // Restore register
407408};
408409
409410/* DWARF Exception Handling pointer encodings */
@@ -518,6 +519,7 @@ typedef struct ELFObjectContext {
518519 uint8_t * p ; // Current write position in buffer
519520 uint8_t * startp ; // Start of buffer (for offset calculations)
520521 uint8_t * eh_frame_p ; // Start of EH frame data (for relative offsets)
522+ uint8_t * fde_p ; // Start of FDE data (for PC-relative calculations)
521523 uint32_t code_size ; // Size of the code being described
522524} ELFObjectContext ;
523525
@@ -642,6 +644,8 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
642644// DWARF EH FRAME GENERATION
643645// =============================================================================
644646
647+ static void elf_init_ehframe (ELFObjectContext * ctx );
648+
645649/*
646650 * Initialize DWARF .eh_frame section for a code region
647651 *
@@ -656,6 +660,23 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
656660 * Args:
657661 * ctx: ELF object context containing code size and buffer pointers
658662 */
663+ static size_t calculate_eh_frame_size (void ) {
664+ /* Dry run: generate the EH frame for the trampoline function into a scratch buffer and return the number of bytes emitted. Takes no arguments. */
665+ extern void * _Py_trampoline_func_start ;
666+ extern void * _Py_trampoline_func_end ;
667+
668+ size_t code_size = (char * )& _Py_trampoline_func_end - (char * )& _Py_trampoline_func_start ; // Byte distance between the trampoline start and end symbols
669+
670+ ELFObjectContext ctx ;
671+ char buffer [1024 ]; // Scratch buffer for DWARF data; only the emitted size is returned. NOTE(review): no bounds check is visible here - confirm 1KB is always enough
672+ ctx .code_size = code_size ;
673+ ctx .startp = ctx .p = (uint8_t * )buffer ; // Write position starts at the beginning of the scratch buffer
674+ ctx .fde_p = NULL ; // Set by elf_init_ehframe when it writes the FDE
675+
676+ elf_init_ehframe (& ctx ); // NOTE(review): ctx.eh_frame_p is left unset here - confirm elf_init_ehframe never reads it
677+ return ctx .p - ctx .startp ; // Bytes written = final write position minus buffer start
678+ }
679+
659680static void elf_init_ehframe (ELFObjectContext * ctx ) {
660681 uint8_t * p = ctx -> p ;
661682 uint8_t * framep = p ; // Remember start of frame data
@@ -783,7 +804,7 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
783804 *
784805 * DWRF_SECTION(FDE,
785806 * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here)
786- * DWRF_U32(-0x30 ); // Initial PC-relative location of the code
807+ * DWRF_U32(pc_relative_offset ); // PC-relative location of the code (calculated dynamically)
787808 * DWRF_U32(ctx->code_size); // Code range covered by this FDE
788809 * DWRF_U8(0); // Augmentation data length (none)
789810 *
@@ -829,19 +850,31 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
829850 DWRF_U32 (0 ); // CIE ID (0 indicates this is a CIE)
830851 DWRF_U8 (DWRF_CIE_VERSION ); // CIE version (1)
831852 DWRF_STR ("zR" ); // Augmentation string: 'z' = augmentation data is present, 'R' = FDE pointer encoding byte follows
832- DWRF_UV (1 ); // Code alignment factor
853+ #ifdef __x86_64__
854+ DWRF_UV (1 ); // Code alignment factor (x86_64: 1 byte)
855+ #elif defined(__aarch64__ ) && defined(__AARCH64EL__ ) && !defined(__ILP32__ )
856+ DWRF_UV (4 ); // Code alignment factor (AArch64: 4 bytes per instruction)
857+ #endif
833858 DWRF_SV (- (int64_t )sizeof (uintptr_t )); // Data alignment factor (negative)
834859 DWRF_U8 (DWRF_REG_RA ); // Return address register number
835860 DWRF_UV (1 ); // Augmentation data length
836861 DWRF_U8 (DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4 ); // FDE pointer encoding
837862
838863 /* Initial CFI instructions - describe default calling convention */
864+ #ifdef __x86_64__
865+ /* x86_64 initial CFI state */
839866 DWRF_U8 (DWRF_CFA_def_cfa ); // Define CFA (Call Frame Address)
840867 DWRF_UV (DWRF_REG_SP ); // CFA = SP register
841868 DWRF_UV (sizeof (uintptr_t )); // CFA = SP + pointer_size
842869 DWRF_U8 (DWRF_CFA_offset |DWRF_REG_RA ); // Return address is saved
843870 DWRF_UV (1 ); // At offset 1 from CFA
844-
871+ #elif defined(__aarch64__ ) && defined(__AARCH64EL__ ) && !defined(__ILP32__ )
872+ /* AArch64 initial CFI state */
873+ DWRF_U8 (DWRF_CFA_def_cfa ); // Define CFA (Call Frame Address)
874+ DWRF_UV (DWRF_REG_SP ); // CFA = SP register
875+ DWRF_UV (0 ); // CFA = SP + 0 (AArch64 starts with offset 0)
876+ // No initial register saves in AArch64 CIE
877+ #endif
845878 DWRF_ALIGNNOP (sizeof (uintptr_t )); // Align to pointer boundary
846879 )
847880
@@ -852,11 +885,15 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
852885 *
853886 * The FDE describes unwinding information specific to this function.
854887 * It references the CIE and provides function-specific CFI instructions.
888+ *
889+ * The PC-relative offset is calculated after the entire EH frame is built
890+ * to ensure accurate positioning relative to the synthesized DSO layout.
855891 */
856892 DWRF_SECTION (FDE ,
857893 DWRF_U32 ((uint32_t )(p - framep )); // Offset to CIE (backwards reference)
858- DWRF_U32 (-0x30 ); // Machine code offset relative to .text
859- DWRF_U32 (ctx -> code_size ); // Address range covered by this FDE (code lenght)
894+ ctx -> fde_p = p ; // Remember where PC offset field is located for later calculation
895+ DWRF_U32 (0 ); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe)
896+ DWRF_U32 (ctx -> code_size ); // Address range covered by this FDE (code length)
860897 DWRF_U8 (0 ); // Augmentation data length (none)
861898
862899 /*
@@ -867,32 +904,36 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
867904 * conventions and register usage patterns.
868905 */
869906#ifdef __x86_64__
870- /* x86_64 calling convention unwinding rules */
907+ /* x86_64 calling convention unwinding rules with frame pointer */
871908# if defined(__CET__ ) && (__CET__ & 1 )
872- DWRF_U8 (DWRF_CFA_advance_loc | 8 ); // Advance location by 8 bytes when CET protection is enabled
873- # else
874- DWRF_U8 (DWRF_CFA_advance_loc | 4 ); // Advance location by 4 bytes
909+ DWRF_U8 (DWRF_CFA_advance_loc | 4 ); // Advance past endbr64 (4 bytes)
875910# endif
876- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Redefine CFA offset
911+ DWRF_U8 (DWRF_CFA_advance_loc | 1 ); // Advance past push %rbp (1 byte)
912+ DWRF_U8 (DWRF_CFA_def_cfa_offset ); // def_cfa_offset 16
877913 DWRF_UV (16 ); // New offset: SP + 16
878- DWRF_U8 (DWRF_CFA_advance_loc | 6 ); // Advance location by 6 bytes
879- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Redefine CFA offset
914+ DWRF_U8 (DWRF_CFA_offset | DWRF_REG_BP ); // offset r6 at cfa-16
915+ DWRF_UV (2 ); // Offset factor: 2 * 8 = 16 bytes
916+ DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance past mov %rsp,%rbp (3 bytes)
917+ DWRF_U8 (DWRF_CFA_def_cfa_register ); // def_cfa_register r6
918+ DWRF_UV (DWRF_REG_BP ); // Use base pointer register
919+ DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
920+ DWRF_U8 (DWRF_CFA_def_cfa ); // def_cfa r7 ofs 8
921+ DWRF_UV (DWRF_REG_SP ); // Use stack pointer register
880922 DWRF_UV (8 ); // New offset: SP + 8
881923#elif defined(__aarch64__ ) && defined(__AARCH64EL__ ) && !defined(__ILP32__ )
882924 /* AArch64 calling convention unwinding rules */
883- DWRF_U8 (DWRF_CFA_advance_loc | 1 ); // Advance location by 1 instruction (stp x29, x30)
884- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Redefine CFA offset
885- DWRF_UV (16 ); // CFA = SP + 16 (stack pointer after push)
886- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_FP ); // Frame pointer (x29) saved
887- DWRF_UV (2 ); // At offset 2 from CFA (2 * 8 = 16 bytes)
888- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_RA ); // Link register (x30) saved
889- DWRF_UV (1 ); // At offset 1 from CFA (1 * 8 = 8 bytes)
890- DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
891- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_FP ); // Restore frame pointer (x29)
892- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_RA ); // Restore link register (x30)
893- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Final CFA adjustment
894- DWRF_UV (0 ); // CFA = SP + 0 (stack restored)
895-
925+ DWRF_U8 (DWRF_CFA_advance_loc | 1 ); // Advance by 1 instruction (4 bytes)
926+ DWRF_U8 (DWRF_CFA_def_cfa_offset ); // CFA = SP + 16
927+ DWRF_UV (16 ); // Stack pointer moved by 16 bytes
928+ DWRF_U8 (DWRF_CFA_offset | DWRF_REG_FP ); // x29 (frame pointer) saved
929+ DWRF_UV (2 ); // At CFA-16 (2 * 8 = 16 bytes from CFA)
930+ DWRF_U8 (DWRF_CFA_offset | DWRF_REG_RA ); // x30 (link register) saved
931+ DWRF_UV (1 ); // At CFA-8 (1 * 8 = 8 bytes from CFA)
932+ DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance by 3 instructions (12 bytes)
933+ DWRF_U8 (DWRF_CFA_restore | DWRF_REG_RA ); // Restore x30 - NO DWRF_UV() after this!
934+ DWRF_U8 (DWRF_CFA_restore | DWRF_REG_FP ); // Restore x29 - NO DWRF_UV() after this!
935+ DWRF_U8 (DWRF_CFA_def_cfa_offset ); // CFA = SP + 0 (stack restored)
936+ DWRF_UV (0 ); // Back to original stack position
896937#else
897938# error "Unsupported target architecture"
898939#endif
@@ -901,6 +942,58 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
901942 )
902943
903944 ctx -> p = p ; // Update context pointer to end of generated data
945+
946+ /* Calculate and update the PC-relative offset in the FDE
947+ *
948+ * When perf processes the jitdump, it creates a synthesized DSO with this layout:
949+ *
950+ * Synthesized DSO Memory Layout:
951+ * ┌─────────────────────────────────────────────────────────────┐ < code_start
952+ * │ Code Section │
953+ * │ (round_up(code_size, 8) bytes) │
954+ * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data
955+ * │ EH Frame Data │
956+ * │ ┌─────────────────────────────────────────────────────┐ │
957+ * │ │ CIE data │ │
958+ * │ └─────────────────────────────────────────────────────┘ │
959+ * │ ┌─────────────────────────────────────────────────────┐ │
960+ * │ │ FDE Header: │ │
961+ * │ │ - CIE offset (4 bytes) │ │
962+ * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start
963+ * │ │ - address range (4 bytes) │ │ (this specific field)
964+ * │ │ CFI Instructions... │ │
965+ * │ └─────────────────────────────────────────────────────┘ │
966+ * ├─────────────────────────────────────────────────────────────┤ < reference_point
967+ * │ EhFrameHeader │
968+ * │ (navigation metadata) │
969+ * └─────────────────────────────────────────────────────────────┘
970+ *
971+ * The PC offset field in the FDE must contain the distance from itself to code_start:
972+ *
973+ * distance = code_start - fde_pc_field
974+ *
975+ * Where:
976+ * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
977+ * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8)
978+ *
979+ * Therefore:
980+ * distance = code_start_location - fde_pc_field_location
981+ * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
982+ * = -rounded_code_size - fde_offset_in_frame
983+ * = -(round_up(code_size, 8) + fde_offset_in_frame)
984+ *
985+ * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field,
986+ * i.e. ctx->fde_p - ctx->startp.
987+ */
988+ if (ctx -> fde_p != NULL ) {
989+ int32_t fde_offset_in_frame = (ctx -> fde_p - ctx -> startp );
990+ int32_t rounded_code_size = round_up (ctx -> code_size , 8 );
991+ int32_t pc_relative_offset = - (rounded_code_size + fde_offset_in_frame );
992+
993+
994+ // Update the PC-relative offset in the FDE
995+ * (int32_t * )ctx -> fde_p = pc_relative_offset ;
996+ }
904997}
905998
906999// =============================================================================
@@ -1001,8 +1094,10 @@ static void* perf_map_jit_init(void) {
10011094 /* Initialize code ID counter */
10021095 perf_jit_map_state .code_id = 0 ;
10031096
1004- /* Configure trampoline API with padding information */
1005- trampoline_api .code_padding = PERF_JIT_CODE_PADDING ;
1097+ /* Calculate padding size based on actual unwind info requirements */
1098+ size_t eh_frame_size = calculate_eh_frame_size ();
1099+ size_t unwind_data_size = sizeof (EhFrameHeader ) + eh_frame_size ;
1100+ trampoline_api .code_padding = round_up (unwind_data_size , 16 );
10061101
10071102 return & perf_jit_map_state ;
10081103}
@@ -1091,6 +1186,7 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
10911186 char buffer [1024 ]; // Buffer for DWARF data (1KB should be sufficient)
10921187 ctx .code_size = code_size ;
10931188 ctx .startp = ctx .p = (uint8_t * )buffer ;
1189+ ctx .fde_p = NULL ; // Initialize to NULL, will be set when FDE is written
10941190
10951191 /* Generate EH frame (Exception Handling frame) data */
10961192 elf_init_ehframe (& ctx );
@@ -1109,7 +1205,7 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
11091205 ev2 .unwind_data_size = sizeof (EhFrameHeader ) + eh_frame_size ;
11101206
11111207 /* Verify we don't exceed our padding budget */
1112- assert (ev2 .unwind_data_size <= PERF_JIT_CODE_PADDING );
1208+ assert (ev2 .unwind_data_size <= ( uint64_t ) trampoline_api . code_padding );
11131209
11141210 ev2 .eh_frame_hdr_size = sizeof (EhFrameHeader );
11151211 ev2 .mapped_size = round_up (ev2 .unwind_data_size , 16 ); // 16-byte alignment
@@ -1261,4 +1357,4 @@ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
12611357 & perf_map_jit_fini , // Cleanup function
12621358};
12631359
1264- #endif /* PY_HAVE_PERF_TRAMPOLINE */
1360+ #endif /* PY_HAVE_PERF_TRAMPOLINE */
0 commit comments