Skip to content

Commit e7de51a

Browse files
prevent duplicate global assignments when reverting helpers
1 parent 02b4d65 commit e7de51a

File tree

4 files changed

+262
-2
lines changed

4 files changed

+262
-2
lines changed

codeflash/code_utils/code_replacer.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,11 +412,17 @@ def replace_function_definitions_in_module(
412412
module_abspath: Path,
413413
preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]],
414414
project_root_path: Path,
415+
global_assignments_added_before: bool = False, # noqa: FBT001, FBT002
415416
) -> bool:
416417
source_code: str = module_abspath.read_text(encoding="utf8")
417418
code_to_apply = get_optimized_code_for_module(module_abspath.relative_to(project_root_path), optimized_code)
419+
418420
new_code: str = replace_functions_and_add_imports(
419-
add_global_assignments(code_to_apply, source_code),
421+
# adding the global assignments before replacing the code, not after
422+
# becuase of an "edge case" where the optimized code intoduced a new import and a global assignment using that import
423+
# and that import wasn't used before, so it was ignored when calling AddImportsVisitor.add_needed_import inside replace_functions_and_add_imports (because the global assignment wasn't added yet)
424+
# this was added at https://github.com/codeflash-ai/codeflash/pull/448
425+
add_global_assignments(code_to_apply, source_code) if not global_assignments_added_before else source_code,
420426
function_names,
421427
code_to_apply,
422428
module_abspath,

codeflash/context/unused_definition_remover.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,7 @@ def revert_unused_helper_functions(
537537
module_abspath=file_path,
538538
preexisting_objects=set(), # Empty set since we're reverting
539539
project_root_path=project_root,
540+
global_assignments_added_before=True, # since we revert helpers functions after applying the optimization, we know that the file already has global assignments added, otherwise they would be added twice.
540541
)
541542

542543
if reverted_code:

codeflash/optimization/function_optimizer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,10 @@ def reformat_code_and_helpers(
820820
return new_code, new_helper_code
821821

822822
def replace_function_and_helpers_with_optimized_code(
823-
self, code_context: CodeOptimizationContext, optimized_code: CodeStringsMarkdown, original_helper_code: str
823+
self,
824+
code_context: CodeOptimizationContext,
825+
optimized_code: CodeStringsMarkdown,
826+
original_helper_code: dict[Path, str],
824827
) -> bool:
825828
did_update = False
826829
read_writable_functions_by_file_path = defaultdict(set)

tests/test_code_replacement.py

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3228,3 +3228,253 @@ def _map_tool_definition(f: ToolDefinition) -> ChatCompletionInputTool:
32283228
assert not re.search(r"^import aiohttp as aiohttp_\b", new_code, re.MULTILINE) # conditional alias import: import <name> as <alias>
32293229
assert not re.search(r"^from math import pi as PI, sin as sine\b", new_code, re.MULTILINE) # conditional multiple aliases imports
32303230
assert "from huggingface_hub import AsyncInferenceClient, ChatCompletionInputTool" not in new_code # conditional from import
3231+
3232+
3233+
def test_test():
3234+
root_dir = Path(__file__).parent.parent.resolve()
3235+
main_file = Path(root_dir / "code_to_optimize/temp_main.py").resolve()
3236+
3237+
original_code = '''"""Chunking objects not specific to a particular chunking strategy."""
3238+
3239+
from __future__ import annotations
3240+
3241+
import collections
3242+
import copy
3243+
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
3244+
3245+
import regex
3246+
from typing_extensions import Self, TypeAlias
3247+
3248+
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
3249+
from unstructured.documents.elements import (
3250+
CompositeElement,
3251+
ConsolidationStrategy,
3252+
Element,
3253+
ElementMetadata,
3254+
Table,
3255+
TableChunk,
3256+
Title,
3257+
)
3258+
from unstructured.utils import lazyproperty
3259+
3260+
# ================================================================================================
3261+
# MODEL
3262+
# ================================================================================================
3263+
3264+
CHUNK_MAX_CHARS_DEFAULT: int = 500
3265+
"""Hard-max chunk-length when no explicit value specified in `max_characters` argument.
3266+
3267+
Provided for reference only, for example so the ingest CLI can advertise the default value in its
3268+
UI. External chunking-related functions (e.g. in ingest or decorators) should use
3269+
`max_characters: int | None = None` and not apply this default themselves. Only
3270+
`ChunkingOptions.max_characters` should apply a default value.
3271+
"""
3272+
3273+
CHUNK_MULTI_PAGE_DEFAULT: bool = True
3274+
"""When False, respect page-boundaries (no two elements from different page in same chunk).
3275+
3276+
Only operative for "by_title" chunking strategy.
3277+
"""
3278+
3279+
BoundaryPredicate: TypeAlias = Callable[[Element], bool]
3280+
"""Detects when element represents crossing a semantic boundary like section or page."""
3281+
3282+
TextAndHtml: TypeAlias = tuple[str, str]
3283+
3284+
# ================================================================================================
3285+
# PRE-CHUNKER
3286+
# ================================================================================================
3287+
3288+
3289+
class PreChunker:
3290+
"""Gathers sequential elements into pre-chunks as length constraints allow.
3291+
3292+
The pre-chunker's responsibilities are:
3293+
3294+
- **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
3295+
either side of those boundaries into different sections. In this case, the primary indicator
3296+
of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
3297+
semantic boundary when `multipage_sections` is `False`.
3298+
3299+
- **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
3300+
into sections as big as possible without exceeding the chunk window size.
3301+
3302+
- **Minimize chunks that must be split mid-text.** Precompute the text length of each section
3303+
and only produce a section that exceeds the chunk window size when there is a single element
3304+
with text longer than that window.
3305+
3306+
A Table element is placed into a section by itself. CheckBox elements are dropped.
3307+
3308+
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
3309+
a new "section", hence the "by-title" designation.
3310+
"""
3311+
3312+
def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3313+
self._elements = elements
3314+
self._opts = opts
3315+
3316+
@lazyproperty
3317+
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3318+
"""The semantic-boundary detectors to be applied to break pre-chunks."""
3319+
return self._opts.boundary_predicates
3320+
3321+
def _is_in_new_semantic_unit(self, element: Element) -> bool:
3322+
"""True when `element` begins a new semantic unit such as a section or page."""
3323+
# -- all detectors need to be called to update state and avoid double counting
3324+
# -- boundaries that happen to coincide, like Table and new section on same element.
3325+
# -- Using `any()` would short-circuit on first True.
3326+
semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
3327+
return any(semantic_boundaries)
3328+
3329+
'''
3330+
main_file.write_text(original_code, encoding="utf-8")
3331+
optim_code = f'''```python:{main_file.relative_to(root_dir)}
3332+
# ================================================================================================
3333+
# PRE-CHUNKER
3334+
# ================================================================================================
3335+
3336+
from __future__ import annotations
3337+
from typing import Iterable
3338+
from unstructured.documents.elements import Element
3339+
from unstructured.utils import lazyproperty
3340+
3341+
class PreChunker:
3342+
def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3343+
self._elements = elements
3344+
self._opts = opts
3345+
3346+
@lazyproperty
3347+
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3348+
"""The semantic-boundary detectors to be applied to break pre-chunks."""
3349+
return self._opts.boundary_predicates
3350+
3351+
def _is_in_new_semantic_unit(self, element: Element) -> bool:
3352+
"""True when `element` begins a new semantic unit such as a section or page."""
3353+
# Use generator expression for lower memory usage and avoid building intermediate list
3354+
for pred in self._boundary_predicates:
3355+
if pred(element):
3356+
return True
3357+
return False
3358+
```
3359+
'''
3360+
3361+
func = FunctionToOptimize(function_name="_is_in_new_semantic_unit", parents=[FunctionParent("PreChunker", "ClassDef")], file_path=main_file)
3362+
test_config = TestConfig(
3363+
tests_root=root_dir / "tests/pytest",
3364+
tests_project_rootdir=root_dir,
3365+
project_root_path=root_dir,
3366+
test_framework="pytest",
3367+
pytest_cmd="pytest",
3368+
)
3369+
func_optimizer = FunctionOptimizer(function_to_optimize=func, test_cfg=test_config)
3370+
code_context: CodeOptimizationContext = func_optimizer.get_code_optimization_context().unwrap()
3371+
3372+
original_helper_code: dict[Path, str] = {}
3373+
helper_function_paths = {hf.file_path for hf in code_context.helper_functions}
3374+
for helper_function_path in helper_function_paths:
3375+
with helper_function_path.open(encoding="utf8") as f:
3376+
helper_code = f.read()
3377+
original_helper_code[helper_function_path] = helper_code
3378+
3379+
func_optimizer.args = Args()
3380+
func_optimizer.replace_function_and_helpers_with_optimized_code(
3381+
code_context=code_context, optimized_code=CodeStringsMarkdown.parse_markdown_code(optim_code), original_helper_code=original_helper_code
3382+
)
3383+
3384+
3385+
new_code = main_file.read_text(encoding="utf-8")
3386+
main_file.unlink(missing_ok=True)
3387+
3388+
expected = '''"""Chunking objects not specific to a particular chunking strategy."""
3389+
3390+
from __future__ import annotations
3391+
3392+
import collections
3393+
import copy
3394+
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
3395+
3396+
import regex
3397+
from typing_extensions import Self, TypeAlias
3398+
3399+
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
3400+
from unstructured.documents.elements import (
3401+
CompositeElement,
3402+
ConsolidationStrategy,
3403+
Element,
3404+
ElementMetadata,
3405+
Table,
3406+
TableChunk,
3407+
Title,
3408+
)
3409+
from unstructured.utils import lazyproperty
3410+
3411+
# ================================================================================================
3412+
# MODEL
3413+
# ================================================================================================
3414+
3415+
CHUNK_MAX_CHARS_DEFAULT: int = 500
3416+
"""Hard-max chunk-length when no explicit value specified in `max_characters` argument.
3417+
3418+
Provided for reference only, for example so the ingest CLI can advertise the default value in its
3419+
UI. External chunking-related functions (e.g. in ingest or decorators) should use
3420+
`max_characters: int | None = None` and not apply this default themselves. Only
3421+
`ChunkingOptions.max_characters` should apply a default value.
3422+
"""
3423+
3424+
CHUNK_MULTI_PAGE_DEFAULT: bool = True
3425+
"""When False, respect page-boundaries (no two elements from different page in same chunk).
3426+
3427+
Only operative for "by_title" chunking strategy.
3428+
"""
3429+
3430+
BoundaryPredicate: TypeAlias = Callable[[Element], bool]
3431+
"""Detects when element represents crossing a semantic boundary like section or page."""
3432+
3433+
TextAndHtml: TypeAlias = tuple[str, str]
3434+
3435+
# ================================================================================================
3436+
# PRE-CHUNKER
3437+
# ================================================================================================
3438+
3439+
3440+
class PreChunker:
3441+
"""Gathers sequential elements into pre-chunks as length constraints allow.
3442+
3443+
The pre-chunker's responsibilities are:
3444+
3445+
- **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
3446+
either side of those boundaries into different sections. In this case, the primary indicator
3447+
of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
3448+
semantic boundary when `multipage_sections` is `False`.
3449+
3450+
- **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
3451+
into sections as big as possible without exceeding the chunk window size.
3452+
3453+
- **Minimize chunks that must be split mid-text.** Precompute the text length of each section
3454+
and only produce a section that exceeds the chunk window size when there is a single element
3455+
with text longer than that window.
3456+
3457+
A Table element is placed into a section by itself. CheckBox elements are dropped.
3458+
3459+
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
3460+
a new "section", hence the "by-title" designation.
3461+
"""
3462+
def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3463+
self._elements = elements
3464+
self._opts = opts
3465+
3466+
@lazyproperty
3467+
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3468+
"""The semantic-boundary detectors to be applied to break pre-chunks."""
3469+
return self._opts.boundary_predicates
3470+
3471+
def _is_in_new_semantic_unit(self, element: Element) -> bool:
3472+
"""True when `element` begins a new semantic unit such as a section or page."""
3473+
# Use generator expression for lower memory usage and avoid building intermediate list
3474+
for pred in self._boundary_predicates:
3475+
if pred(element):
3476+
return True
3477+
return False
3478+
3479+
'''
3480+
assert new_code == expected

0 commit comments

Comments
 (0)