-
Notifications
You must be signed in to change notification settings - Fork 280
Fix: Flatten recursion in SplitRecursively to prevent stack overflow #1127
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix: Flatten recursion in SplitRecursively to prevent stack overflow #1127
Conversation
georgeh0
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the PR! Overall looks quite good!
| let mut atom_collector = AtomChunksCollector { | ||
| full_text: self.full_text, | ||
| min_level: 0, | ||
| min_level: usize::MAX, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is meant to be 0. Its initial value is meant to be the level at the boundary (this is also why we reset it to curr_level after starting a new chunk).
| struct AtomRoutingPlan { | ||
| start_idx: usize, // index of `atom_chunks` for the start chunk | ||
| prev_plan_idx: usize, // index of `plans` for the previous plan | ||
| start_idx: usize, | ||
| prev_plan_idx: usize, | ||
| cost: usize, | ||
| overlap_cost_base: usize, | ||
| } | ||
| type PrevPlanCandidate = (std::cmp::Reverse<usize>, usize); // (cost, start_idx) | ||
|
|
||
| if atom_chunks.is_empty() || atom_chunks.len() == 1 { | ||
| return Vec::new(); | ||
| } | ||
|
|
||
| let mut plans = Vec::with_capacity(atom_chunks.len()); | ||
| // Janitor | ||
| plans.push(AtomRoutingPlan { | ||
| start_idx: 0, | ||
| prev_plan_idx: 0, | ||
| cost: 0, | ||
| overlap_cost_base: self.get_overlap_cost_base(0), | ||
| }); | ||
| let mut prev_plan_candidates = std::collections::BinaryHeap::<PrevPlanCandidate>::new(); | ||
|
|
||
| let mut gap_cost_cache = vec![0]; | ||
| let mut syntax_level_gap_cost = |boundary: usize, internal: usize| -> usize { | ||
| if boundary > internal { | ||
| let gap = boundary - internal; | ||
| for i in gap_cost_cache.len()..=gap { | ||
| gap_cost_cache.push(gap_cost_cache[i - 1] + SYNTAX_LEVEL_GAP_COST / i); | ||
| } | ||
| gap_cost_cache[gap] | ||
| } else { | ||
| 0 | ||
| } | ||
| }; | ||
|
|
||
| for (i, chunk) in atom_chunks[0..atom_chunks.len() - 1].iter().enumerate() { | ||
| for i in 0..atom_chunks.len() - 1 { | ||
| let mut min_cost = usize::MAX; | ||
| let mut arg_min_start_idx: usize = 0; | ||
| let mut arg_min_prev_plan_idx: usize = 0; | ||
| let mut start_idx = i; | ||
|
|
||
| let end_syntax_level = atom_chunks[i + 1].boundary_syntax_level; | ||
| let end_lb_level = atom_chunks[i + 1].boundary_lb_level; | ||
|
|
||
| let mut internal_syntax_level = usize::MAX; | ||
| let mut internal_lb_level = LineBreakLevel::Inline; | ||
|
|
||
| fn lb_level_gap(boundary: LineBreakLevel, internal: LineBreakLevel) -> usize { | ||
| if boundary.ord() < internal.ord() { | ||
| internal.ord() - boundary.ord() | ||
| } else { | ||
| 0 | ||
| } | ||
| } | ||
|
|
||
| loop { | ||
| let start_chunk = &atom_chunks[start_idx]; | ||
| let chunk_size = chunk.range.end - start_chunk.range.start; | ||
| let current_chunk_end = atom_chunks[i].range.end; | ||
| let chunk_size = current_chunk_end - start_chunk.range.start; | ||
|
|
||
| let mut cost = 0; | ||
| cost += | ||
| syntax_level_gap_cost(start_chunk.boundary_syntax_level, internal_syntax_level); | ||
| cost += syntax_level_gap_cost(end_syntax_level, internal_syntax_level); | ||
| cost += (lb_level_gap(start_chunk.boundary_lb_level, internal_lb_level) | ||
| + lb_level_gap(end_lb_level, internal_lb_level)) | ||
| * PER_LINE_BREAK_LEVEL_GAP_COST; | ||
| if chunk_size < self.min_chunk_size { | ||
| cost += TOO_SMALL_CHUNK_COST; | ||
| } | ||
|
|
||
| if chunk_size > self.chunk_size { | ||
| if min_cost == usize::MAX { | ||
| min_cost = cost + plans[start_idx].cost; | ||
| arg_min_start_idx = start_idx; | ||
| arg_min_prev_plan_idx = start_idx; | ||
| } | ||
| break; | ||
| } | ||
|
|
||
| let prev_plan_idx = if self.chunk_overlap > 0 { | ||
| while let Some(top_prev_plan) = prev_plan_candidates.peek() { | ||
| let overlap_size = | ||
| atom_chunks[top_prev_plan.1].range.end - start_chunk.range.start; | ||
| if overlap_size <= self.chunk_overlap { | ||
| let mut best_prev_plan_idx = start_idx; | ||
| if self.chunk_overlap > 0 { | ||
| let mut min_prev_plan_cost = plans[start_idx].cost; | ||
| for k_idx in (0..start_idx).rev() { | ||
| let end_of_prev_chunk = if k_idx > 0 { | ||
| atom_chunks[k_idx - 1].range.end | ||
| } else { | ||
| 0 | ||
| }; | ||
| let overlap = end_of_prev_chunk.saturating_sub(start_chunk.range.start); | ||
| if overlap > self.chunk_overlap { | ||
| break; | ||
| } | ||
| prev_plan_candidates.pop(); | ||
| if plans[k_idx].cost < min_prev_plan_cost { | ||
| min_prev_plan_cost = plans[k_idx].cost; | ||
| best_prev_plan_idx = k_idx; | ||
| } | ||
| } | ||
| prev_plan_candidates.push(( | ||
| std::cmp::Reverse( | ||
| plans[start_idx].cost + plans[start_idx].overlap_cost_base, | ||
| ), | ||
| start_idx, | ||
| )); | ||
| prev_plan_candidates.peek().unwrap().1 | ||
| } else { | ||
| start_idx | ||
| }; | ||
| let prev_plan = &plans[prev_plan_idx]; | ||
| } | ||
|
|
||
| let prev_plan = &plans[best_prev_plan_idx]; | ||
| cost += prev_plan.cost; | ||
| if self.chunk_overlap == 0 { | ||
| cost += MISSING_OVERLAP_COST / 2; | ||
|
|
||
| let end_of_prev_chunk = if best_prev_plan_idx > 0 { | ||
| atom_chunks[best_prev_plan_idx - 1].range.end | ||
| } else { | ||
| let start_cost_base = self.get_overlap_cost_base(start_chunk.range.start); | ||
| cost += if prev_plan.overlap_cost_base < start_cost_base { | ||
| MISSING_OVERLAP_COST + prev_plan.overlap_cost_base - start_cost_base | ||
| } else { | ||
| MISSING_OVERLAP_COST | ||
| }; | ||
| 0 | ||
| }; | ||
| if end_of_prev_chunk < start_chunk.range.start { | ||
| cost += MISSING_OVERLAP_COST; | ||
| } | ||
|
|
||
| if cost < min_cost { | ||
| min_cost = cost; | ||
| arg_min_start_idx = start_idx; | ||
| arg_min_prev_plan_idx = prev_plan_idx; | ||
| arg_min_prev_plan_idx = best_prev_plan_idx; | ||
| } | ||
|
|
||
| if start_idx == 0 { | ||
| break; | ||
| } | ||
|
|
||
| start_idx -= 1; | ||
| internal_syntax_level = | ||
| internal_syntax_level.min(start_chunk.boundary_syntax_level); | ||
| internal_lb_level = internal_lb_level.max(start_chunk.internal_lb_level); | ||
| internal_syntax_level.min(atom_chunks[start_idx].boundary_syntax_level); | ||
| internal_lb_level = internal_lb_level.max(atom_chunks[start_idx].internal_lb_level); | ||
| } | ||
| plans.push(AtomRoutingPlan { | ||
| start_idx: arg_min_start_idx, | ||
| prev_plan_idx: arg_min_prev_plan_idx, | ||
| cost: min_cost, | ||
| overlap_cost_base: self.get_overlap_cost_base(chunk.range.end), | ||
| }); | ||
| prev_plan_candidates.clear(); | ||
| } | ||
|
|
||
| let mut output = Vec::new(); | ||
| let mut plan_idx = plans.len() - 1; | ||
| while plan_idx > 0 { | ||
| let plan = &plans[plan_idx]; | ||
| let start_chunk = &atom_chunks[plan.start_idx]; | ||
| let end_chunk = &atom_chunks[plan_idx - 1]; | ||
| let std_range: Range<usize> = start_chunk.range.start..end_chunk.range.end; | ||
| output.push(ChunkOutput { | ||
| start_pos: Position::new(start_chunk.range.start), | ||
| end_pos: Position::new(end_chunk.range.end), | ||
| text: &self.full_text[start_chunk.range.start..end_chunk.range.end], | ||
| text: &self.full_text[std_range], | ||
| }); | ||
| plan_idx = plan.prev_plan_idx; | ||
| } | ||
| output.reverse(); | ||
| output |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's avoid changing this function now? It changes the optimization target function and seems unnecessary.
| let mut iter_stack: Vec<Box<dyn Iterator<Item = Chunk<'t, 's>>>> = | ||
| vec![Box::new(std::iter::once(chunk))]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changes to this function look quite good!
Only a minor thing: We can avoid this Box<dyn Iterator<...>> thing by creating a enum with two variants wrapping these two specific iterators. We can implement Iterator on this enum, then everything else will just work. Box<dyn Iterator<...>> is a little bit heavy here as it has heap allocation per node.
|
Hi @georgeh0, thanks for the feedback! I've addressed all your suggestions and pushed the new commit. |
|
Hi @georgeh0 , @badmonster0 , I've pushed the requested changes. When you have a chance, could you please take another look? |
| start_idx: usize, | ||
| prev_plan_idx: usize, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we revert this change to put the comments back?
The same for all other changes in merge_atom_chunks() below. They're irrelevant.
|
@georgeh0 @badmonster0 Done! I've reverted the changes in |
|
Please fix the build error and make the tests pass. |
|
@georgeh0 @badmonster0 |
|
Still failed. Please run |
|
Hi @georgeh0, my apologies for the failing checks on the last push. I've found and fixed the final bug causing the test failure. I've confirmed that the full cargo test suite now passes successfully on my machine. The PR should be ready for another review. |
|
hi @siddharthbaleja7 latest release out and we had a section for you, thanks for contributing https://cocoindex.io/blogs/cocoindex-changelog-2025-10-19#siddharthbaleja7 |
Resolves #898
Problem
The previous implementation of
SplitRecursively used a recursive approach to traverse the AST. This caused a stack overflow crash when processing code with deeply nested structures, as noted in the issue. The temporary recursion limit was a stopgap that negatively affected the quality of the splits.

Solution
This PR refactors the core logic in
collect_atom_chunks to be fully iterative, replacing the recursive traversal with an explicit iterator stack (Vec<Box<dyn Iterator>>).

Verification
All tests now pass successfully with
cargo test. The changes have been verified against deeply nested code structures that previously caused the application to hang or crash.

Note: While this fixes the stack overflow crash, overall memory usage for extremely large files can still be high. This is inherent to the current chunk-merging algorithm and can be addressed in future performance optimizations.