@@ -3228,3 +3228,253 @@ def _map_tool_definition(f: ToolDefinition) -> ChatCompletionInputTool:
32283228 assert not re .search (r"^import aiohttp as aiohttp_\b" , new_code , re .MULTILINE ) # conditional alias import: import <name> as <alias>
32293229 assert not re .search (r"^from math import pi as PI, sin as sine\b" , new_code , re .MULTILINE ) # conditional multiple aliases imports
32303230 assert "from huggingface_hub import AsyncInferenceClient, ChatCompletionInputTool" not in new_code # conditional from import
3231+
3232+
3233+ def test_test ():
3234+ root_dir = Path (__file__ ).parent .parent .resolve ()
3235+ main_file = Path (root_dir / "code_to_optimize/temp_main.py" ).resolve ()
3236+
3237+ original_code = '''"""Chunking objects not specific to a particular chunking strategy."""
3238+
3239+ from __future__ import annotations
3240+
3241+ import collections
3242+ import copy
3243+ from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
3244+
3245+ import regex
3246+ from typing_extensions import Self, TypeAlias
3247+
3248+ from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
3249+ from unstructured.documents.elements import (
3250+ CompositeElement,
3251+ ConsolidationStrategy,
3252+ Element,
3253+ ElementMetadata,
3254+ Table,
3255+ TableChunk,
3256+ Title,
3257+ )
3258+ from unstructured.utils import lazyproperty
3259+
3260+ # ================================================================================================
3261+ # MODEL
3262+ # ================================================================================================
3263+
3264+ CHUNK_MAX_CHARS_DEFAULT: int = 500
3265+ """Hard-max chunk-length when no explicit value specified in `max_characters` argument.
3266+
3267+ Provided for reference only, for example so the ingest CLI can advertise the default value in its
3268+ UI. External chunking-related functions (e.g. in ingest or decorators) should use
3269+ `max_characters: int | None = None` and not apply this default themselves. Only
3270+ `ChunkingOptions.max_characters` should apply a default value.
3271+ """
3272+
3273+ CHUNK_MULTI_PAGE_DEFAULT: bool = True
3274+ """When False, respect page-boundaries (no two elements from different page in same chunk).
3275+
3276+ Only operative for "by_title" chunking strategy.
3277+ """
3278+
3279+ BoundaryPredicate: TypeAlias = Callable[[Element], bool]
3280+ """Detects when element represents crossing a semantic boundary like section or page."""
3281+
3282+ TextAndHtml: TypeAlias = tuple[str, str]
3283+
3284+ # ================================================================================================
3285+ # PRE-CHUNKER
3286+ # ================================================================================================
3287+
3288+
3289+ class PreChunker:
3290+ """Gathers sequential elements into pre-chunks as length constraints allow.
3291+
3292+ The pre-chunker's responsibilities are:
3293+
3294+ - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
3295+ either side of those boundaries into different sections. In this case, the primary indicator
3296+ of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
3297+ semantic boundary when `multipage_sections` is `False`.
3298+
3299+ - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
3300+ into sections as big as possible without exceeding the chunk window size.
3301+
3302+ - **Minimize chunks that must be split mid-text.** Precompute the text length of each section
3303+ and only produce a section that exceeds the chunk window size when there is a single element
3304+ with text longer than that window.
3305+
3306+ A Table element is placed into a section by itself. CheckBox elements are dropped.
3307+
3308+ The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
3309+ a new "section", hence the "by-title" designation.
3310+ """
3311+
3312+ def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3313+ self._elements = elements
3314+ self._opts = opts
3315+
3316+ @lazyproperty
3317+ def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3318+ """The semantic-boundary detectors to be applied to break pre-chunks."""
3319+ return self._opts.boundary_predicates
3320+
3321+ def _is_in_new_semantic_unit(self, element: Element) -> bool:
3322+ """True when `element` begins a new semantic unit such as a section or page."""
3323+ # -- all detectors need to be called to update state and avoid double counting
3324+ # -- boundaries that happen to coincide, like Table and new section on same element.
3325+ # -- Using `any()` would short-circuit on first True.
3326+ semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
3327+ return any(semantic_boundaries)
3328+
3329+ '''
3330+ main_file .write_text (original_code , encoding = "utf-8" )
3331+ optim_code = f'''```python:{ main_file .relative_to (root_dir )}
3332+ # ================================================================================================
3333+ # PRE-CHUNKER
3334+ # ================================================================================================
3335+
3336+ from __future__ import annotations
3337+ from typing import Iterable
3338+ from unstructured.documents.elements import Element
3339+ from unstructured.utils import lazyproperty
3340+
3341+ class PreChunker:
3342+ def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3343+ self._elements = elements
3344+ self._opts = opts
3345+
3346+ @lazyproperty
3347+ def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3348+ """The semantic-boundary detectors to be applied to break pre-chunks."""
3349+ return self._opts.boundary_predicates
3350+
3351+ def _is_in_new_semantic_unit(self, element: Element) -> bool:
3352+ """True when `element` begins a new semantic unit such as a section or page."""
3353+ # Use generator expression for lower memory usage and avoid building intermediate list
3354+ for pred in self._boundary_predicates:
3355+ if pred(element):
3356+ return True
3357+ return False
3358+ ```
3359+ '''
3360+
3361+ func = FunctionToOptimize (function_name = "_is_in_new_semantic_unit" , parents = [FunctionParent ("PreChunker" , "ClassDef" )], file_path = main_file )
3362+ test_config = TestConfig (
3363+ tests_root = root_dir / "tests/pytest" ,
3364+ tests_project_rootdir = root_dir ,
3365+ project_root_path = root_dir ,
3366+ test_framework = "pytest" ,
3367+ pytest_cmd = "pytest" ,
3368+ )
3369+ func_optimizer = FunctionOptimizer (function_to_optimize = func , test_cfg = test_config )
3370+ code_context : CodeOptimizationContext = func_optimizer .get_code_optimization_context ().unwrap ()
3371+
3372+ original_helper_code : dict [Path , str ] = {}
3373+ helper_function_paths = {hf .file_path for hf in code_context .helper_functions }
3374+ for helper_function_path in helper_function_paths :
3375+ with helper_function_path .open (encoding = "utf8" ) as f :
3376+ helper_code = f .read ()
3377+ original_helper_code [helper_function_path ] = helper_code
3378+
3379+ func_optimizer .args = Args ()
3380+ func_optimizer .replace_function_and_helpers_with_optimized_code (
3381+ code_context = code_context , optimized_code = CodeStringsMarkdown .parse_markdown_code (optim_code ), original_helper_code = original_helper_code
3382+ )
3383+
3384+
3385+ new_code = main_file .read_text (encoding = "utf-8" )
3386+ main_file .unlink (missing_ok = True )
3387+
3388+ expected = '''"""Chunking objects not specific to a particular chunking strategy."""
3389+
3390+ from __future__ import annotations
3391+
3392+ import collections
3393+ import copy
3394+ from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
3395+
3396+ import regex
3397+ from typing_extensions import Self, TypeAlias
3398+
3399+ from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
3400+ from unstructured.documents.elements import (
3401+ CompositeElement,
3402+ ConsolidationStrategy,
3403+ Element,
3404+ ElementMetadata,
3405+ Table,
3406+ TableChunk,
3407+ Title,
3408+ )
3409+ from unstructured.utils import lazyproperty
3410+
3411+ # ================================================================================================
3412+ # MODEL
3413+ # ================================================================================================
3414+
3415+ CHUNK_MAX_CHARS_DEFAULT: int = 500
3416+ """Hard-max chunk-length when no explicit value specified in `max_characters` argument.
3417+
3418+ Provided for reference only, for example so the ingest CLI can advertise the default value in its
3419+ UI. External chunking-related functions (e.g. in ingest or decorators) should use
3420+ `max_characters: int | None = None` and not apply this default themselves. Only
3421+ `ChunkingOptions.max_characters` should apply a default value.
3422+ """
3423+
3424+ CHUNK_MULTI_PAGE_DEFAULT: bool = True
3425+ """When False, respect page-boundaries (no two elements from different page in same chunk).
3426+
3427+ Only operative for "by_title" chunking strategy.
3428+ """
3429+
3430+ BoundaryPredicate: TypeAlias = Callable[[Element], bool]
3431+ """Detects when element represents crossing a semantic boundary like section or page."""
3432+
3433+ TextAndHtml: TypeAlias = tuple[str, str]
3434+
3435+ # ================================================================================================
3436+ # PRE-CHUNKER
3437+ # ================================================================================================
3438+
3439+
3440+ class PreChunker:
3441+ """Gathers sequential elements into pre-chunks as length constraints allow.
3442+
3443+ The pre-chunker's responsibilities are:
3444+
3445+ - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
3446+ either side of those boundaries into different sections. In this case, the primary indicator
3447+ of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
3448+ semantic boundary when `multipage_sections` is `False`.
3449+
3450+ - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
3451+ into sections as big as possible without exceeding the chunk window size.
3452+
3453+ - **Minimize chunks that must be split mid-text.** Precompute the text length of each section
3454+ and only produce a section that exceeds the chunk window size when there is a single element
3455+ with text longer than that window.
3456+
3457+ A Table element is placed into a section by itself. CheckBox elements are dropped.
3458+
3459+ The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
3460+ a new "section", hence the "by-title" designation.
3461+ """
3462+ def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3463+ self._elements = elements
3464+ self._opts = opts
3465+
3466+ @lazyproperty
3467+ def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3468+ """The semantic-boundary detectors to be applied to break pre-chunks."""
3469+ return self._opts.boundary_predicates
3470+
3471+ def _is_in_new_semantic_unit(self, element: Element) -> bool:
3472+ """True when `element` begins a new semantic unit such as a section or page."""
3473+ # Use generator expression for lower memory usage and avoid building intermediate list
3474+ for pred in self._boundary_predicates:
3475+ if pred(element):
3476+ return True
3477+ return False
3478+
3479+ '''
3480+ assert new_code == expected
0 commit comments