Skip to content

Commit 2f0ffdd

Browse files
committed
Improve CFG optimization with new LOAD_FAST_BORROW safety checks.
1 parent 9df82ce commit 2f0ffdd

File tree

1 file changed

+340
-9
lines changed

1 file changed

+340
-9
lines changed

Python/flowgraph.c

Lines changed: 340 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ typedef struct _PyCfgBasicblock {
7171
unsigned b_cold : 1;
7272
/* b_warm is used by the cold-detection algorithm to mark blocks which are definitely not cold */
7373
unsigned b_warm : 1;
74+
/* b_dfs_visited_gen is used by the borrow-safe analysis to mark whether a block has been visited */
75+
int b_dfs_visited_gen;
7476
} basicblock;
7577

7678

@@ -95,6 +97,8 @@ typedef struct _PyCfgBuilder cfg_builder;
9597
#define LOCATION(LNO, END_LNO, COL, END_COL) \
9698
((const _Py_SourceLocation){(LNO), (END_LNO), (COL), (END_COL)})
9799

100+
/* Generation id for the borrow-safety DFS traversals below.
   check_borrow_safety_globally() bumps this counter once per analysis; a
   basic block counts as "visited in the current traversal" iff its
   b_dfs_visited_gen equals this value, so no per-traversal reset pass over
   the blocks is needed.
   NOTE(review): file-scope mutable state — presumably only reached while the
   compiler holds the runtime lock; confirm this path is never concurrent. */
static int cfg_dfs_generation_counter = 0;
101+
98102
static inline int
99103
is_block_push(cfg_instr *i)
100104
{
@@ -2701,6 +2705,267 @@ load_fast_push_block(basicblock ***sp, basicblock *target,
27012705
}
27022706
}
27032707

2708+
/*
 * Recursively determine if a borrowed reference remains safe along all CFG paths.
 *
 * This function performs a Depth-First Search (DFS) starting from 'current_block'.
 * It tracks a specific value that was notionally loaded by a LOAD_FAST_BORROW
 * instruction and is currently at 'depth_of_value_on_entry' on the operand stack
 * (0 being TOS, -1 if already known to be consumed). The 'original_local_idx'
 * is the index of the local variable from which this value was borrowed.
 *
 * A borrow is considered UNSAFE if, along any execution path before the
 * borrowed value is consumed from the stack:
 *   1. The 'original_local_idx' (the backing store for the borrow) is overwritten
 *      (e.g., by STORE_FAST, DELETE_FAST on that local).
 *   2. The borrowed value itself, when at the top of the stack (depth 0), is
 *      consumed by an instruction that stores it into any local variable
 *      (e.g., STORE_FAST).
 *   3. The value is not consumed and the CFG path ends (e.g., end of function
 *      without consumption) or enters a cycle where it's not consumed.
 *
 * The function uses 'cfg_dfs_generation_counter' in conjunction with
 * 'current_block->b_dfs_visited_gen' to detect cycles within the current
 * specific DFS traversal, preventing infinite loops and deeming such cyclic
 * paths unsafe if the value isn't consumed within the cycle.
 * NOTE(review): the visited marker also rejects re-convergent (diamond) paths
 * that reach the same block twice in one traversal; that is conservative
 * (never wrongly "safe") — confirm this is the intended trade-off.
 *
 * Stack depths are tracked by:
 *  - 'depth_of_value_on_entry': The depth of the tracked item upon entering 'current_block'.
 *  - 'current_target_depth': The depth of the item as instructions in 'current_block' are processed.
 *  - 'depth_before_jump_op_effect': When a jump instruction is encountered, this
 *    is calculated by simulating all prior instructions in 'current_block' to find
 *    the item's depth *just before* the jump instruction itself has any stack effect.
 *    This precise depth is then used to calculate the 'target_entry_depth' for the
 *    recursive call to the jump's target block.
 *
 * Args:
 *   current_block: The basic block to start analysis from for this recursive step.
 *   depth_of_value_on_entry: The depth of the tracked borrowed value from TOS
 *                            (0 = TOS, -1 if already consumed).
 *   original_local_idx: The index of the local variable backing the borrow.
 *   g: The CFG builder. NOTE(review): only forwarded to recursive calls,
 *      never dereferenced here — kept for interface symmetry, presumably.
 *
 * Returns:
 *   true if the borrow is safe along all paths from this point, false otherwise.
 */
static bool
is_borrow_safe(
    basicblock *current_block,
    Py_ssize_t depth_of_value_on_entry,
    int original_local_idx,
    cfg_builder *g)
{
    // Already consumed on this path: nothing left to protect.
    if (depth_of_value_on_entry == -1) {
        return true;
    }

    // Revisiting a block within the same traversal generation: cycle (or
    // re-convergence) without consumption -> conservatively unsafe.
    if (current_block->b_dfs_visited_gen == cfg_dfs_generation_counter) {
        return false;
    }
    current_block->b_dfs_visited_gen = cfg_dfs_generation_counter;

    Py_ssize_t current_target_depth = depth_of_value_on_entry;

    for (int i = 0; i < current_block->b_iused; i++) {
        cfg_instr *instr = &current_block->b_instr[i];
        int opcode = instr->i_opcode;
        int oparg = instr->i_oparg;

        // 1. Check if the original local is killed before the value is consumed.
        if ((opcode == STORE_FAST || opcode == DELETE_FAST || opcode == LOAD_FAST_AND_CLEAR) &&
            oparg == original_local_idx) {
            return false;
        }

        // 2. Simulate stack effect and check for consumption or unsafe store.
        //    get_stack_effects() is called purely as a validity check here;
        //    the actual pop/push counts come from _PyOpcode_num_popped/pushed.
        stack_effects effects_noj;
        if (get_stack_effects(opcode, oparg, 0, &effects_noj) < 0) {
            return false; // Error in stack effect calculation
        }
        int num_popped = _PyOpcode_num_popped(opcode, oparg);
        int num_pushed = _PyOpcode_num_pushed(opcode, oparg);

        if (current_target_depth < num_popped) {
            if (opcode == STORE_FAST && current_target_depth == 0) {
                // Unsafe: borrowed value was at TOS and is being stored into a local.
                return false;
            }
            return true; // Safely consumed by this instruction.
        }
        // Value not consumed, update its depth.
        current_target_depth = current_target_depth - num_popped + num_pushed;

        // 3. Handle branches (jumps are always last in a basic block).
        if (HAS_TARGET(opcode)) {
            if (i != current_block->b_iused - 1) {
                continue; // Skip jumps that are not the last instruction in the block.
            }
            bool safe_on_all_branches = true;

            // Calculate item's depth just before this jump instruction's own stack effect.
            // NOTE(review): this re-simulates instructions [0, i) even though the
            // main loop already walked them; redundant but side-effect free.
            Py_ssize_t depth_before_jump_op_effect = depth_of_value_on_entry;
            for(int k=0; k < i; ++k) { // Iterate instructions before the current jump
                cfg_instr *prev_instr_in_block = &current_block->b_instr[k];
                // Kill check for intermediate instructions
                if ((prev_instr_in_block->i_opcode == STORE_FAST ||
                     prev_instr_in_block->i_opcode == DELETE_FAST ||
                     prev_instr_in_block->i_opcode == LOAD_FAST_AND_CLEAR) &&
                    prev_instr_in_block->i_oparg == original_local_idx) {
                    return false;
                }
                stack_effects prev_effects;
                if (get_stack_effects(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg, 0, &prev_effects) < 0) {
                    return false;
                }
                int prev_popped_val = _PyOpcode_num_popped(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
                if (depth_before_jump_op_effect < prev_popped_val) { // Consumed before jump
                    if (prev_instr_in_block->i_opcode == STORE_FAST && depth_before_jump_op_effect == 0) {
                        return false; // Stored into local
                    }
                    return true; // Safely consumed before the jump
                }
                depth_before_jump_op_effect = depth_before_jump_op_effect - prev_popped_val +
                    _PyOpcode_num_pushed(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
            }

            // Analyze jump target path.
            // NOTE(review): the pop count below comes from _PyOpcode_num_popped,
            // which takes no jump/no-jump flag — confirm it matches the
            // taken-branch effect for all conditional opcodes handled here.
            stack_effects effects_jump;
            if (get_stack_effects(opcode, oparg, 1, &effects_jump) < 0) return false;
            int jump_popped_val = _PyOpcode_num_popped(opcode, oparg);
            Py_ssize_t target_entry_depth = depth_before_jump_op_effect;

            if (target_entry_depth < jump_popped_val) { // Consumed by the jump op itself
                if (opcode == STORE_FAST && target_entry_depth == 0) {
                    safe_on_all_branches = false;
                }
                // else: safely consumed by jump operation.
            } else { // Not consumed by jump op, recurse on target.
                target_entry_depth = target_entry_depth - jump_popped_val + _PyOpcode_num_pushed(opcode, oparg);
                if (!is_borrow_safe(instr->i_target, target_entry_depth, original_local_idx, g)) {
                    safe_on_all_branches = false;
                }
            }

            // Analyze fallthrough path for conditional jumps, if the jump path was safe.
            if (safe_on_all_branches && IS_CONDITIONAL_JUMP_OPCODE(opcode) && current_block->b_next) {
                // 'current_target_depth' already reflects the stack state if the jump is *not* taken.
                if (!is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g)) {
                    safe_on_all_branches = false;
                }
            }
            return safe_on_all_branches;
        }
    }

    // When the instructions finish and the value isn't consumed, check fallthrough.
    if (current_block->b_next && BB_HAS_FALLTHROUGH(current_block)) {
        return is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g);
    }

    // No further path (or no fallthrough) and value is still on stack (current_target_depth is valid).
    // This means it's unconsumed at the end of a CFG path.
    return false;
}
2869+
2870+
2871+
/*
2872+
* Initiates a Control Flow Graph (CFG) wide safety check for a borrowed reference.
2873+
*
2874+
* This function is called when a LOAD_FAST instruction is a candidate for
2875+
* LOAD_FAST_BORROW, but its produced value ('REF_UNCONSUMED' flag is set)
2876+
* might live beyond the current basic block ('producer_block').
2877+
*
2878+
* It determines the immediate successor(s) of the 'producer_block' and the
2879+
* stack depth at which the borrowed value (identified by 'original_local_idx'
2880+
* and initially at 'depth_of_value_at_producer_end' from TOS) would enter
2881+
* these successors.
2882+
*
2883+
* It then calls 'is_borrow_safe()' for each successor path. The borrow is
2884+
* considered globally safe only if 'is_borrow_safe()' returns true for ALL
2885+
* possible successor paths.
2886+
*
2887+
* A new DFS traversal is started by incrementing 'cfg_dfs_generation_counter'
2888+
* to ensure that 'is_borrow_safe()' uses a fresh set of visited markers
2889+
* ('b_dfs_visited_gen') for its analysis.
2890+
*
2891+
* Args:
2892+
* producer_block: The basic block containing the LOAD_FAST candidate.
2893+
* depth_of_value_at_producer_end: The depth of the candidate borrowed value
2894+
* from TOS at the end of 'producer_block'.
2895+
* (0 = TOS, -1 if already consumed - though
2896+
* this function expects a live value).
2897+
* original_local_idx: The index of the local variable backing the borrow.
2898+
* g: The CFG builder.
2899+
*
2900+
* Returns:
2901+
* true if the borrow is safe across all subsequent CFG paths, false otherwise.
2902+
*/
2903+
static bool
2904+
check_borrow_safety_globally(
2905+
basicblock *producer_block,
2906+
Py_ssize_t depth_of_value_at_producer_end,
2907+
int original_local_idx,
2908+
cfg_builder *g)
2909+
{
2910+
cfg_dfs_generation_counter++;
2911+
bool overall_safety = true;
2912+
2913+
cfg_instr *last_instr = basicblock_last_instr(producer_block);
2914+
2915+
// If depth is -1, implies value was already consumed or is invalid for tracking.
2916+
if (depth_of_value_at_producer_end == -1) {
2917+
return false;
2918+
}
2919+
2920+
if (last_instr && HAS_TARGET(last_instr->i_opcode)) {
2921+
stack_effects effects_jump;
2922+
if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 1, &effects_jump) < 0) return false;
2923+
int jump_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
2924+
Py_ssize_t entry_depth_for_target = depth_of_value_at_producer_end;
2925+
2926+
if (entry_depth_for_target < jump_popped) { // Consumed by the jump itself.
2927+
if (last_instr->i_opcode == STORE_FAST && entry_depth_for_target == 0) {
2928+
overall_safety = false; // Unsafe store of borrowed value.
2929+
}
2930+
// else: safely consumed.
2931+
} else {
2932+
entry_depth_for_target = entry_depth_for_target - jump_popped +
2933+
_PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
2934+
if (!is_borrow_safe(last_instr->i_target, entry_depth_for_target, original_local_idx, g)) {
2935+
overall_safety = false;
2936+
}
2937+
}
2938+
2939+
// Analyze fallthrough path for conditional jumps, if the jump path was safe.
2940+
if (overall_safety && IS_CONDITIONAL_JUMP_OPCODE(last_instr->i_opcode) && producer_block->b_next) {
2941+
stack_effects effects_fall; // Stack effect if jump is NOT taken.
2942+
if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 0, &effects_fall) < 0) return false;
2943+
int fall_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
2944+
Py_ssize_t entry_depth_for_fallthrough = depth_of_value_at_producer_end;
2945+
2946+
if (entry_depth_for_fallthrough < fall_popped) { // Consumed by not taking jump.
2947+
if (last_instr->i_opcode == STORE_FAST && entry_depth_for_fallthrough == 0) {
2948+
overall_safety = false; // Unsafe store.
2949+
}
2950+
// else: safely consumed.
2951+
} else { // Not consumed by not taking jump, recurse on fallthrough.
2952+
entry_depth_for_fallthrough = entry_depth_for_fallthrough - fall_popped +
2953+
_PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
2954+
if (!is_borrow_safe(producer_block->b_next, entry_depth_for_fallthrough, original_local_idx, g)) {
2955+
overall_safety = false;
2956+
}
2957+
}
2958+
}
2959+
} else if (producer_block->b_next && BB_HAS_FALLTHROUGH(producer_block)) { // Standard fallthrough, no jump.
2960+
if (!is_borrow_safe(producer_block->b_next, depth_of_value_at_producer_end, original_local_idx, g)) {
2961+
overall_safety = false;
2962+
}
2963+
} else { // No successors (e.g., block ends in RETURN/RAISE) and value still on stack.
2964+
overall_safety = false; // Unconsumed at end of a CFG path.
2965+
}
2966+
return overall_safety;
2967+
}
2968+
27042969
/*
27052970
* Strength reduce LOAD_FAST{_LOAD_FAST} instructions into faster variants that
27062971
* load borrowed references onto the operand stack.
@@ -2747,6 +3012,7 @@ optimize_load_fast(cfg_builder *g)
27473012
basicblock *entryblock = g->g_entryblock;
27483013
for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
27493014
max_instrs = Py_MAX(max_instrs, b->b_iused);
3015+
b->b_dfs_visited_gen = 0;
27503016
}
27513017
size_t instr_flags_size = max_instrs * sizeof(uint8_t);
27523018
uint8_t *instr_flags = PyMem_Malloc(instr_flags_size);
@@ -2976,17 +3242,82 @@ optimize_load_fast(cfg_builder *g)
29763242

29773243
// Optimize instructions
29783244
for (int i = 0; i < block->b_iused; i++) {
2979-
if (!(instr_flags[i] & (SUPPORT_KILLED | STORED_AS_LOCAL))) {
2980-
cfg_instr *instr = &block->b_instr[i];
2981-
switch (instr->i_opcode) {
2982-
case LOAD_FAST:
3245+
cfg_instr *instr = &block->b_instr[i];
3246+
bool is_load_fast_type = (instr->i_opcode == LOAD_FAST || instr->i_opcode == LOAD_FAST_LOAD_FAST);
3247+
3248+
if (is_load_fast_type) {
3249+
bool killed_or_stored_locally = (instr_flags[i] & (SUPPORT_KILLED | STORED_AS_LOCAL));
3250+
if (killed_or_stored_locally) {
3251+
continue; // Definitely cannot borrow due to local block issues
3252+
}
3253+
3254+
bool unconsumed_in_block = (instr_flags[i] & REF_UNCONSUMED);
3255+
bool can_borrow = false;
3256+
3257+
if (!unconsumed_in_block) {
3258+
can_borrow = true; // Safe by local analysis, consumed in current block
3259+
} else {
3260+
if (instr->i_opcode == LOAD_FAST) {
3261+
int local_idx = instr->i_oparg;
3262+
Py_ssize_t depth_from_tos_at_block_end = -1;
3263+
// Find the specific item on the `refs` stack (at end of current block simulation)
3264+
for (Py_ssize_t k = 0; k < refs.size; k++) {
3265+
ref r = ref_stack_at(&refs, k);
3266+
if (r.instr == i && r.local == local_idx) { // Match instruction and local
3267+
depth_from_tos_at_block_end = refs.size - 1 - k;
3268+
break;
3269+
}
3270+
}
3271+
3272+
if (depth_from_tos_at_block_end != -1) {
3273+
can_borrow = check_borrow_safety_globally(block, depth_from_tos_at_block_end, local_idx, g);
3274+
} else {
3275+
// If REF_UNCONSUMED is set but we couldn't find its depth, assume unsafe.
3276+
// This can happen if refs.size is 0, yet flag is set.
3277+
can_borrow = false;
3278+
}
3279+
3280+
} else { // LOAD_FAST_LOAD_FAST
3281+
can_borrow = true; // Assume true, prove false if any part is unsafe
3282+
int local_idx1 = instr->i_oparg >> 4;
3283+
int local_idx2 = instr->i_oparg & 15;
3284+
Py_ssize_t depth1 = -1, depth2 = -1;
3285+
3286+
// Find depths on `refs` stack for both products of this LFLF instruction `i`
3287+
for (Py_ssize_t k = 0; k < refs.size; k++) {
3288+
ref r = ref_stack_at(&refs, k);
3289+
if (r.instr == i) {
3290+
Py_ssize_t current_depth = refs.size - 1 - k;
3291+
if (r.local == local_idx1 && depth1 == -1) depth1 = current_depth;
3292+
else if (r.local == local_idx2 && depth2 == -1) depth2 = current_depth;
3293+
}
3294+
}
3295+
3296+
bool found_any_on_stack = (depth1 != -1 || depth2 != -1);
3297+
3298+
if (depth1 != -1) {
3299+
if (!check_borrow_safety_globally(block, depth1, local_idx1, g)) {
3300+
can_borrow = false;
3301+
}
3302+
}
3303+
if (can_borrow && depth2 != -1) {
3304+
if (!check_borrow_safety_globally(block, depth2, local_idx2, g)) {
3305+
can_borrow = false;
3306+
}
3307+
}
3308+
3309+
if (unconsumed_in_block && !found_any_on_stack) {
3310+
can_borrow = false;
3311+
}
3312+
}
3313+
}
3314+
3315+
if (can_borrow) {
3316+
if (instr->i_opcode == LOAD_FAST) {
29833317
instr->i_opcode = LOAD_FAST_BORROW;
2984-
break;
2985-
case LOAD_FAST_LOAD_FAST:
3318+
} else if (instr->i_opcode == LOAD_FAST_LOAD_FAST) {
29863319
instr->i_opcode = LOAD_FAST_BORROW_LOAD_FAST_BORROW;
2987-
break;
2988-
default:
2989-
break;
3320+
}
29903321
}
29913322
}
29923323
}

0 commit comments

Comments
 (0)