Skip to content

Commit 9ac4445

Browse files
authored
refactor title.py (#2657)
Minor refactor after conversation with @scanny Updates docstring and how chunking options are accessed. `self._kwargs.get()` should only be used in the `lazyproperty` definition of an instance's attribute. Other calls should use `self.<attribute>`
1 parent 2eb0b25 commit 9ac4445

File tree

4 files changed

+27
-37
lines changed

4 files changed

+27
-37
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.12.7-dev6
1+
## 0.12.7-dev7
22

33
### Enhancements
44

test_unstructured/chunking/test_title.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -642,19 +642,13 @@ def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
642642
)
643643

644644
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
645-
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
646-
647-
In particular, `combine_text_under_n_chars` value is adjusted down to the
648-
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
649-
value of `new_after_n_chars`.
650-
"""
645+
"""Caller can specify `new_after_n_chars` arg without specifying any other options."""
651646
try:
652-
opts = _ByTitleChunkingOptions(new_after_n_chars=200)
647+
opts = _ByTitleChunkingOptions.new(new_after_n_chars=200)
653648
except ValueError:
654649
pytest.fail("did not accept `new_after_n_chars` as option by itself")
655650

656651
assert opts.soft_max == 200
657-
assert opts.combine_text_under_n_chars == 200
658652

659653
@pytest.mark.parametrize(
660654
("multipage_sections", "expected_value"),

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.12.7-dev6" # pragma: no cover
1+
__version__ = "0.12.7-dev7" # pragma: no cover

unstructured/chunking/title.py

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ class _ByTitleChunkingOptions(ChunkingOptions):
101101
102102
`by_title`-specific options:
103103
104+
combine_text_under_n_chars
105+
A remedy to over-chunking caused by elements mis-identified as Title elements.
106+
Every Title element would start a new chunk and this setting mitigates that, at the
107+
expense of sometimes violating legitimate semantic boundaries.
104108
multipage_sections
105109
Indicates that page-boundaries should not be respected while chunking, i.e. elements
106110
appearing on two different pages can appear in the same chunk.
@@ -133,15 +137,9 @@ def combine_text_under_n_chars(self) -> int:
133137
- Defaults to `max_characters` when not specified.
134138
- Is reduced to `new_after_n_chars` when it exceeds that value.
135139
"""
136-
max_characters = self.hard_max
137-
soft_max = self.soft_max
138-
arg_value = self._kwargs.get("combine_text_under_n_chars")
139-
140140
# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified --
141-
combine_text_under_n_chars = max_characters if arg_value is None else arg_value
142-
143-
# -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
144-
return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
141+
arg_value = self._kwargs.get("combine_text_under_n_chars")
142+
return self.hard_max if arg_value is None else arg_value
145143

146144
@lazyproperty
147145
def multipage_sections(self) -> bool:
@@ -154,22 +152,20 @@ def _validate(self) -> None:
154152
# -- start with base-class validations --
155153
super()._validate()
156154

157-
combine_text_under_n_chars_arg = self._kwargs.get("combine_text_under_n_chars")
158-
if combine_text_under_n_chars_arg is not None:
159-
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
160-
# -- but a negative value is not
161-
if combine_text_under_n_chars_arg < 0:
162-
raise ValueError(
163-
f"'combine_text_under_n_chars' argument must be >= 0,"
164-
f" got {combine_text_under_n_chars_arg}"
165-
)
166-
167-
# -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
168-
# -- users. The chunking behavior would be no different than when
169-
# -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
170-
# -- default (500) then it can look like chunk-combining isn't working.
171-
if combine_text_under_n_chars_arg > self.hard_max:
172-
raise ValueError(
173-
f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
174-
f" value, got {combine_text_under_n_chars_arg} > {self.hard_max}"
175-
)
155+
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
156+
# -- but a negative value is not
157+
if self.combine_text_under_n_chars < 0:
158+
raise ValueError(
159+
f"'combine_text_under_n_chars' argument must be >= 0,"
160+
f" got {self.combine_text_under_n_chars}"
161+
)
162+
163+
# -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
164+
# -- users. The chunking behavior would be no different than when
165+
# -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
166+
# -- default (500) then it can look like chunk-combining isn't working.
167+
if self.combine_text_under_n_chars > self.hard_max:
168+
raise ValueError(
169+
f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
170+
f" value, got {self.combine_text_under_n_chars} > {self.hard_max}"
171+
)

0 commit comments

Comments
 (0)