@@ -71,6 +71,8 @@ typedef struct _PyCfgBasicblock {
     unsigned b_cold : 1;
     /* b_warm is used by the cold-detection algorithm to mark blocks which are definitely not cold */
     unsigned b_warm : 1;
+    /* b_dfs_visited_gen is used by the borrow-safe analysis to mark whether a block has been visited */
+    int b_dfs_visited_gen;
 } basicblock;
 
 
@@ -95,6 +97,8 @@ typedef struct _PyCfgBuilder cfg_builder;
 #define LOCATION(LNO, END_LNO, COL, END_COL) \
     ((const _Py_SourceLocation){(LNO), (END_LNO), (COL), (END_COL)})
 
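+/* Generation counter for the borrow-safety DFS. It is incremented once per
+ * top-level query, so b_dfs_visited_gen markers left over from earlier
+ * traversals are automatically stale, with no need to clear every block. */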
+static int cfg_dfs_generation_counter = 0;
+
 static inline int
 is_block_push(cfg_instr *i)
 {
@@ -2701,6 +2705,267 @@ load_fast_push_block(basicblock ***sp, basicblock *target,
     }
 }
 
+/*
+ * Recursively determine whether a borrowed reference remains safe along all
+ * CFG paths.
+ *
+ * This function performs a depth-first search (DFS) starting from
+ * 'current_block'. It tracks a specific value that was notionally loaded by a
+ * LOAD_FAST_BORROW instruction and currently sits at 'depth_of_value_on_entry'
+ * on the operand stack (0 being TOS, -1 if already known to be consumed).
+ * 'original_local_idx' is the index of the local variable from which the
+ * value was borrowed.
+ *
+ * A borrow is UNSAFE if, along any execution path, before the borrowed value
+ * is consumed from the stack:
+ *   1. 'original_local_idx' (the backing store for the borrow) is overwritten
+ *      or cleared (e.g., by STORE_FAST or DELETE_FAST on that local).
+ *   2. The borrowed value itself, while at the top of the stack (depth 0), is
+ *      consumed by an instruction that stores it into a local variable
+ *      (e.g., STORE_FAST).
+ *   3. The value is never consumed and the CFG path ends (e.g., the function
+ *      ends without consuming it), or the path enters a cycle in which the
+ *      value is not consumed.
+ *
+ * The function uses 'cfg_dfs_generation_counter' together with
+ * 'current_block->b_dfs_visited_gen' to detect cycles within the current DFS
+ * traversal; this prevents infinite loops and deems a cyclic path unsafe when
+ * the value is not consumed within the cycle.
+ *
+ * Stack depths are tracked as follows:
+ *   - 'depth_of_value_on_entry': depth of the tracked item on entry to
+ *     'current_block'.
+ *   - 'current_target_depth': depth of the item as the instructions of
+ *     'current_block' are processed.
+ *   - 'depth_before_jump_op_effect': when a jump instruction is encountered,
+ *     this is computed by replaying all prior instructions in 'current_block'
+ *     to find the item's depth *just before* the jump has any stack effect.
+ *     That depth is then used to compute the 'target_entry_depth' for the
+ *     recursive call on the jump's target block.
+ *
+ * Args:
+ *   current_block: the basic block to analyze in this recursive step.
+ *   depth_of_value_on_entry: depth of the tracked borrowed value from TOS
+ *     (0 = TOS, -1 if already consumed).
+ *   original_local_idx: index of the local variable backing the borrow.
+ *   g: the CFG builder.
+ *
+ * Returns:
+ *   true if the borrow is safe along all paths from this point, false
+ *   otherwise.
+ */
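+/*
+ * Illustrative examples (assuming typical CPython bytecode for these
+ * snippets; they are not produced by this file). For
+ *
+ *     def f(x):
+ *         y = x            # LOAD_FAST x; STORE_FAST y
+ *
+ * the value pushed by LOAD_FAST x is consumed at TOS by STORE_FAST, so the
+ * borrow would escape into 'y' (unsafe case 2 above). And in
+ *
+ *     g(x, (x := None))    # ... LOAD_FAST x; ...; STORE_FAST x; CALL 2
+ *
+ * the backing local 'x' is overwritten while the borrowed value is still on
+ * the stack awaiting the call (unsafe case 1 above).
+ */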
+static bool
+is_borrow_safe(
+    basicblock *current_block,
+    Py_ssize_t depth_of_value_on_entry,
+    int original_local_idx,
+    cfg_builder *g)
+{
+    if (depth_of_value_on_entry == -1) {
+        return true;
+    }
+
+    if (current_block->b_dfs_visited_gen == cfg_dfs_generation_counter) {
+        return false;
+    }
+    current_block->b_dfs_visited_gen = cfg_dfs_generation_counter;
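+    // Each block is visited at most once per generation, which bounds the
+    // traversal and guarantees termination on cyclic CFGs.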
+
+    Py_ssize_t current_target_depth = depth_of_value_on_entry;
+
+    for (int i = 0; i < current_block->b_iused; i++) {
+        cfg_instr *instr = &current_block->b_instr[i];
+        int opcode = instr->i_opcode;
+        int oparg = instr->i_oparg;
+
+        // 1. Check whether the original local is killed before the value is consumed.
+        if ((opcode == STORE_FAST || opcode == DELETE_FAST || opcode == LOAD_FAST_AND_CLEAR) &&
+            oparg == original_local_idx) {
+            return false;
+        }
+
+        // 2. Simulate the stack effect and check for consumption or an unsafe store.
+        stack_effects effects_noj;
+        if (get_stack_effects(opcode, oparg, 0, &effects_noj) < 0) {
+            return false;  // Error in stack-effect calculation.
+        }
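+        // Note: effects_noj is computed only to validate the opcode/oparg
+        // pair; the pop/push counts used below come from _PyOpcode_num_popped
+        // and _PyOpcode_num_pushed.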
+        int num_popped = _PyOpcode_num_popped(opcode, oparg);
+        int num_pushed = _PyOpcode_num_pushed(opcode, oparg);
+
+        if (current_target_depth < num_popped) {
+            if (opcode == STORE_FAST && current_target_depth == 0) {
+                // Unsafe: the borrowed value is at TOS and is being stored into a local.
+                return false;
+            }
+            return true;  // Safely consumed by this instruction.
+        }
+        // Value not consumed; update its depth.
+        current_target_depth = current_target_depth - num_popped + num_pushed;
+
+        // 3. Handle branches (jumps are always last in a basic block).
+        if (HAS_TARGET(opcode)) {
+            if (i != current_block->b_iused - 1) {
+                continue;  // Defensive: skip a jump that is not the block's last instruction.
+            }
+            bool safe_on_all_branches = true;
+
+            // Compute the item's depth just before this jump's own stack effect.
+            Py_ssize_t depth_before_jump_op_effect = depth_of_value_on_entry;
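+            // The replay below is needed because, at this point,
+            // current_target_depth has already absorbed this jump's
+            // fall-through stack effect in the code above.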
+            for (int k = 0; k < i; ++k) {  // Iterate over the instructions before the jump.
+                cfg_instr *prev_instr_in_block = &current_block->b_instr[k];
+                // Kill check for intermediate instructions.
+                if ((prev_instr_in_block->i_opcode == STORE_FAST ||
+                     prev_instr_in_block->i_opcode == DELETE_FAST ||
+                     prev_instr_in_block->i_opcode == LOAD_FAST_AND_CLEAR) &&
+                    prev_instr_in_block->i_oparg == original_local_idx) {
+                    return false;
+                }
+                stack_effects prev_effects;
+                if (get_stack_effects(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg, 0, &prev_effects) < 0) {
+                    return false;
+                }
+                int prev_popped_val = _PyOpcode_num_popped(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
+                if (depth_before_jump_op_effect < prev_popped_val) {  // Consumed before the jump.
+                    if (prev_instr_in_block->i_opcode == STORE_FAST && depth_before_jump_op_effect == 0) {
+                        return false;  // Stored into a local.
+                    }
+                    return true;  // Safely consumed before the jump.
+                }
+                depth_before_jump_op_effect = depth_before_jump_op_effect - prev_popped_val +
+                    _PyOpcode_num_pushed(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
+            }
+
+            // Analyze the jump-target path.
+            stack_effects effects_jump;
+            if (get_stack_effects(opcode, oparg, 1, &effects_jump) < 0) {
+                return false;
+            }
+            int jump_popped_val = _PyOpcode_num_popped(opcode, oparg);
+            Py_ssize_t target_entry_depth = depth_before_jump_op_effect;
+
+            if (target_entry_depth < jump_popped_val) {  // Consumed by the jump op itself.
+                if (opcode == STORE_FAST && target_entry_depth == 0) {
+                    safe_on_all_branches = false;
+                }
+                // else: safely consumed by the jump operation.
+            }
+            else {  // Not consumed by the jump op; recurse on the target.
+                target_entry_depth = target_entry_depth - jump_popped_val + _PyOpcode_num_pushed(opcode, oparg);
+                if (!is_borrow_safe(instr->i_target, target_entry_depth, original_local_idx, g)) {
+                    safe_on_all_branches = false;
+                }
+            }
+
+            // Analyze the fallthrough path of a conditional jump, if the jump path was safe.
+            if (safe_on_all_branches && IS_CONDITIONAL_JUMP_OPCODE(opcode) && current_block->b_next) {
+                // 'current_target_depth' already reflects the stack state when the jump is *not* taken.
+                if (!is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g)) {
+                    safe_on_all_branches = false;
+                }
+            }
+            return safe_on_all_branches;
+        }
+    }
+
+    // Instructions exhausted without consuming the value; check fallthrough.
+    if (current_block->b_next && BB_HAS_FALLTHROUGH(current_block)) {
+        return is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g);
+    }
+
+    // No further path (or no fallthrough) while the value is still on the
+    // stack (current_target_depth is valid): it is unconsumed at the end of
+    // a CFG path.
+    return false;
+}
+
+
+/*
+ * Initiate a CFG-wide safety check for a borrowed reference.
+ *
+ * This function is called when a LOAD_FAST instruction is a candidate for
+ * LOAD_FAST_BORROW but the value it produces (its REF_UNCONSUMED flag is set)
+ * may live beyond the current basic block ('producer_block').
+ *
+ * It determines the immediate successor(s) of 'producer_block' and the stack
+ * depth at which the borrowed value (identified by 'original_local_idx' and
+ * initially at 'depth_of_value_at_producer_end' from TOS) enters each
+ * successor, then calls is_borrow_safe() on each successor path. The borrow
+ * is globally safe only if is_borrow_safe() returns true for ALL successor
+ * paths.
+ *
+ * A new DFS traversal is started by incrementing
+ * 'cfg_dfs_generation_counter', which gives is_borrow_safe() a fresh set of
+ * visited markers ('b_dfs_visited_gen') for its analysis.
+ *
+ * Args:
+ *   producer_block: the basic block containing the LOAD_FAST candidate.
+ *   depth_of_value_at_producer_end: depth of the candidate borrowed value
+ *     from TOS at the end of 'producer_block' (0 = TOS; -1 means already
+ *     consumed, though this function expects a live value).
+ *   original_local_idx: index of the local variable backing the borrow.
+ *   g: the CFG builder.
+ *
+ * Returns:
+ *   true if the borrow is safe across all subsequent CFG paths, false
+ *   otherwise.
+ */
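+/*
+ * Illustrative CFG shape (a hand-written sketch, not output of this code):
+ *
+ *   B1: LOAD_FAST        x      ; borrow candidate, value at TOS
+ *       LOAD_FAST        cond
+ *       POP_JUMP_IF_TRUE B3     ; pops cond; x's value is back at depth 0
+ *   B2: STORE_FAST       y      ; unsafe: borrowed TOS escapes into a local
+ *   B3: RETURN_VALUE            ; safe: consumed by the return
+ *
+ * The jump path (B3) is safe but the fallthrough path (B2) is not, so the
+ * borrow must be rejected.
+ */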
+static bool
+check_borrow_safety_globally(
+    basicblock *producer_block,
+    Py_ssize_t depth_of_value_at_producer_end,
+    int original_local_idx,
+    cfg_builder *g)
+{
+    cfg_dfs_generation_counter++;
+    bool overall_safety = true;
+
+    cfg_instr *last_instr = basicblock_last_instr(producer_block);
+
+    // A depth of -1 means the value was already consumed or is invalid for tracking.
+    if (depth_of_value_at_producer_end == -1) {
+        return false;
+    }
+
+    if (last_instr && HAS_TARGET(last_instr->i_opcode)) {
+        stack_effects effects_jump;
+        if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 1, &effects_jump) < 0) {
+            return false;
+        }
+        int jump_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
+        Py_ssize_t entry_depth_for_target = depth_of_value_at_producer_end;
+
+        if (entry_depth_for_target < jump_popped) {  // Consumed by the jump itself.
+            if (last_instr->i_opcode == STORE_FAST && entry_depth_for_target == 0) {
+                overall_safety = false;  // Unsafe store of the borrowed value.
+            }
+            // else: safely consumed.
+        }
+        else {
+            entry_depth_for_target = entry_depth_for_target - jump_popped +
+                _PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
+            if (!is_borrow_safe(last_instr->i_target, entry_depth_for_target, original_local_idx, g)) {
+                overall_safety = false;
+            }
+        }
+
+        // Analyze the fallthrough path of a conditional jump, if the jump path was safe.
+        if (overall_safety && IS_CONDITIONAL_JUMP_OPCODE(last_instr->i_opcode) && producer_block->b_next) {
+            stack_effects effects_fall;  // Stack effect if the jump is NOT taken.
+            if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 0, &effects_fall) < 0) {
+                return false;
+            }
+            int fall_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
+            Py_ssize_t entry_depth_for_fallthrough = depth_of_value_at_producer_end;
+
+            if (entry_depth_for_fallthrough < fall_popped) {  // Consumed by not taking the jump.
+                if (last_instr->i_opcode == STORE_FAST && entry_depth_for_fallthrough == 0) {
+                    overall_safety = false;  // Unsafe store.
+                }
+                // else: safely consumed.
+            }
+            else {  // Not consumed by not taking the jump; recurse on the fallthrough.
+                entry_depth_for_fallthrough = entry_depth_for_fallthrough - fall_popped +
+                    _PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
+                if (!is_borrow_safe(producer_block->b_next, entry_depth_for_fallthrough, original_local_idx, g)) {
+                    overall_safety = false;
+                }
+            }
+        }
+    }
+    else if (producer_block->b_next && BB_HAS_FALLTHROUGH(producer_block)) {  // Plain fallthrough, no jump.
+        if (!is_borrow_safe(producer_block->b_next, depth_of_value_at_producer_end, original_local_idx, g)) {
+            overall_safety = false;
+        }
+    }
+    else {  // No successors (e.g., the block ends in RETURN/RAISE) and the value is still on the stack.
+        overall_safety = false;  // Unconsumed at the end of a CFG path.
+    }
+    return overall_safety;
+}
+
 /*
  * Strength reduce LOAD_FAST{_LOAD_FAST} instructions into faster variants that
  * load borrowed references onto the operand stack.
@@ -2747,6 +3012,7 @@ optimize_load_fast(cfg_builder *g)
     basicblock *entryblock = g->g_entryblock;
    for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
         max_instrs = Py_MAX(max_instrs, b->b_iused);
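+        // 0 never matches a live generation: cfg_dfs_generation_counter is
+        // incremented before each query, so the first query uses 1.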
+        b->b_dfs_visited_gen = 0;
     }
     size_t instr_flags_size = max_instrs * sizeof(uint8_t);
     uint8_t *instr_flags = PyMem_Malloc(instr_flags_size);
@@ -2976,17 +3242,82 @@ optimize_load_fast(cfg_builder *g)
 
         // Optimize instructions
         for (int i = 0; i < block->b_iused; i++) {
-            if (!(instr_flags[i] & (SUPPORT_KILLED | STORED_AS_LOCAL))) {
-                cfg_instr *instr = &block->b_instr[i];
-                switch (instr->i_opcode) {
-                    case LOAD_FAST:
+            cfg_instr *instr = &block->b_instr[i];
+            bool is_load_fast_type = (instr->i_opcode == LOAD_FAST ||
+                                      instr->i_opcode == LOAD_FAST_LOAD_FAST);
+
+            if (is_load_fast_type) {
+                bool killed_or_stored_locally = (instr_flags[i] & (SUPPORT_KILLED | STORED_AS_LOCAL));
+                if (killed_or_stored_locally) {
+                    continue;  // Definitely cannot borrow: killed or stored within this block.
+                }
+
+                bool unconsumed_in_block = (instr_flags[i] & REF_UNCONSUMED);
+                bool can_borrow = false;
+
+                if (!unconsumed_in_block) {
+                    can_borrow = true;  // Safe by local analysis: consumed in the current block.
+                }
+                else {
+                    if (instr->i_opcode == LOAD_FAST) {
+                        int local_idx = instr->i_oparg;
+                        Py_ssize_t depth_from_tos_at_block_end = -1;
+                        // Find this value on the simulated `refs` stack, as it stands
+                        // at the end of the current block.
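+                        // `refs` is indexed from the bottom of the simulated operand
+                        // stack, so the item at index k sits at depth
+                        // refs.size - 1 - k from TOS.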
+                        for (Py_ssize_t k = 0; k < refs.size; k++) {
+                            ref r = ref_stack_at(&refs, k);
+                            if (r.instr == i && r.local == local_idx) {  // Match instruction and local.
+                                depth_from_tos_at_block_end = refs.size - 1 - k;
+                                break;
+                            }
+                        }
+
+                        if (depth_from_tos_at_block_end != -1) {
+                            can_borrow = check_borrow_safety_globally(block, depth_from_tos_at_block_end, local_idx, g);
+                        }
+                        else {
+                            // REF_UNCONSUMED is set but the value's depth could not be
+                            // found (e.g., refs.size is 0); assume unsafe.
+                            can_borrow = false;
+                        }
+                    }
+                    else {  // LOAD_FAST_LOAD_FAST
+                        can_borrow = true;  // Assume safe; prove unsafe if either part is.
+                        int local_idx1 = instr->i_oparg >> 4;
+                        int local_idx2 = instr->i_oparg & 15;
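+                        // LOAD_FAST_LOAD_FAST packs two local indices into one oparg
+                        // (high nibble first) and pushes their values in that order.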
+                        Py_ssize_t depth1 = -1, depth2 = -1;
+
+                        // Find the depths on the `refs` stack of both values produced
+                        // by this LOAD_FAST_LOAD_FAST instruction `i`.
+                        for (Py_ssize_t k = 0; k < refs.size; k++) {
+                            ref r = ref_stack_at(&refs, k);
+                            if (r.instr == i) {
+                                Py_ssize_t current_depth = refs.size - 1 - k;
+                                if (r.local == local_idx1 && depth1 == -1) {
+                                    depth1 = current_depth;
+                                }
+                                else if (r.local == local_idx2 && depth2 == -1) {
+                                    depth2 = current_depth;
+                                }
+                            }
+                        }
+
+                        bool found_any_on_stack = (depth1 != -1 || depth2 != -1);
+
+                        if (depth1 != -1 &&
+                            !check_borrow_safety_globally(block, depth1, local_idx1, g)) {
+                            can_borrow = false;
+                        }
+                        if (can_borrow && depth2 != -1 &&
+                            !check_borrow_safety_globally(block, depth2, local_idx2, g)) {
+                            can_borrow = false;
+                        }
+
+                        // REF_UNCONSUMED is set for this instruction, yet neither value
+                        // was found on the stack; assume unsafe.
+                        if (!found_any_on_stack) {
+                            can_borrow = false;
+                        }
+                    }
+                }
+
+                if (can_borrow) {
+                    if (instr->i_opcode == LOAD_FAST) {
                         instr->i_opcode = LOAD_FAST_BORROW;
-                        break;
-                    case LOAD_FAST_LOAD_FAST:
+                    }
+                    else if (instr->i_opcode == LOAD_FAST_LOAD_FAST) {
                         instr->i_opcode = LOAD_FAST_BORROW_LOAD_FAST_BORROW;
-                        break;
-                    default:
-                        break;
+                    }
                 }
             }
         }