Skip to content

Commit 05ff975

Browse files
authored
fix: remove unused ElementMetadata.section (#2921)
**Summary** The `.section` field in `ElementMetadata` is dead code, possibly a remainder from a prior iteration of `partition_epub()`. In any case, it is not populated by any partitioner. Remove it and any code that uses it.
1 parent 305247b commit 05ff975

File tree

11 files changed

+21
-170
lines changed

11 files changed

+21
-170
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.13.4-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
9+
* **Remove ElementMetadata.section field.** This field was unused, not populated by any partitioner.
10+
111
## 0.13.3
212

313
### Enhancements

docs/source/core/chunking.rst

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -152,13 +152,6 @@ following behaviors:
152152
``Title`` element would fit in the prior chunk. This implements the first aspect of the "preserve
153153
section boundaries" contract.
154154

155-
- **Detect metadata.section change.** An element with a new value in ``element.metadata.section`` is
156-
considered to start a new section. When a change in this value is encountered a new chunk is
157-
started. This implements the second aspect of preserving section boundaries. This metadata is not
158-
present in all document formats so is not used alone. An element having ``None`` for this metadata
159-
field is considered to be part of the prior section; a section break is only detected on an
160-
explicit change in value.
161-
162155
- **Respect page boundaries.** Page boundaries can optionally also be respected using the
163156
``multipage_sections`` argument. This defaults to ``True`` meaning that a page break does *not*
164157
start a new chunk. Setting this to ``False`` will separate elements that occur on different pages

test_unstructured/chunking/test_base.py

Lines changed: 0 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
TextPreChunk,
1818
TextPreChunkAccumulator,
1919
_TextSplitter,
20-
is_in_next_section,
2120
is_on_next_page,
2221
is_title,
2322
)
@@ -1514,68 +1513,6 @@ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
15141513
# ================================================================================================
15151514

15161515

1517-
class Describe_is_in_next_section:
1518-
"""Unit-test suite for `unstructured.chunking.base.is_in_next_section()` function.
1519-
1520-
`is_in_next_section()` is not itself a predicate, rather it returns a predicate on Element
1521-
(`Callable[[Element], bool]`) that can be called repeatedly to detect section changes in an
1522-
element stream.
1523-
"""
1524-
1525-
def it_is_false_for_the_first_element_when_it_has_a_non_None_section(self):
1526-
"""This is an explicit first-section; first-section does not represent a section break."""
1527-
pred = is_in_next_section()
1528-
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
1529-
1530-
def and_it_is_false_for_the_first_element_when_it_has_a_None_section(self):
1531-
"""This is an anonymous first-section; still doesn't represent a section break."""
1532-
pred = is_in_next_section()
1533-
assert not pred(Text("abcd"))
1534-
1535-
def it_is_false_for_None_section_elements_that_follow_an_explicit_first_section(self):
1536-
"""A `None` section element is considered to continue the prior section."""
1537-
pred = is_in_next_section()
1538-
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
1539-
assert not pred(Text("efgh"))
1540-
assert not pred(Text("ijkl"))
1541-
1542-
def and_it_is_false_for_None_section_elements_that_follow_an_anonymous_first_section(self):
1543-
"""A `None` section element is considered to continue the prior section."""
1544-
pred = is_in_next_section()
1545-
assert not pred(Text("abcd"))
1546-
assert not pred(Text("efgh"))
1547-
assert not pred(Text("ijkl"))
1548-
1549-
def it_is_false_for_matching_section_elements_that_follow_an_explicit_first_section(self):
1550-
pred = is_in_next_section()
1551-
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
1552-
assert not pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
1553-
assert not pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
1554-
1555-
def it_is_true_for_an_explicit_section_element_that_follows_an_anonymous_first_section(self):
1556-
pred = is_in_next_section()
1557-
assert not pred(Text("abcd"))
1558-
assert not pred(Text("efgh"))
1559-
assert pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
1560-
1561-
def and_it_is_true_for_a_different_explicit_section_that_follows_an_explicit_section(self):
1562-
pred = is_in_next_section()
1563-
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
1564-
assert pred(Text("efgh", metadata=ElementMetadata(section="Summary")))
1565-
1566-
def it_is_true_whenever_the_section_explicitly_changes_except_at_the_start(self):
1567-
pred = is_in_next_section()
1568-
assert not pred(Text("abcd"))
1569-
assert pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
1570-
assert not pred(Text("ijkl"))
1571-
assert not pred(Text("mnop", metadata=ElementMetadata(section="Introduction")))
1572-
assert not pred(Text("qrst"))
1573-
assert pred(Text("uvwx", metadata=ElementMetadata(section="Summary")))
1574-
assert not pred(Text("yzab", metadata=ElementMetadata(section="Summary")))
1575-
assert not pred(Text("cdef"))
1576-
assert pred(Text("ghij", metadata=ElementMetadata(section="Appendix")))
1577-
1578-
15791516
class Describe_is_on_next_page:
15801517
"""Unit-test suite for `unstructured.chunking.base.is_on_next_page()` function.
15811518

test_unstructured/chunking/test_title.py

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -139,43 +139,6 @@ def test_chunk_by_title():
139139
)
140140

141141

142-
def test_chunk_by_title_respects_section_change():
143-
elements: list[Element] = [
144-
Title("A Great Day", metadata=ElementMetadata(section="first")),
145-
Text("Today is a great day.", metadata=ElementMetadata(section="second")),
146-
Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
147-
Table("Heading\nCell text"),
148-
Title("An Okay Day"),
149-
Text("Today is an okay day."),
150-
Text("It is rainy outside."),
151-
Title("A Bad Day"),
152-
Text(
153-
"Today is a bad day.",
154-
metadata=ElementMetadata(
155-
regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
156-
),
157-
),
158-
Text("It is storming outside."),
159-
CheckBox(),
160-
]
161-
162-
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
163-
164-
assert chunks == [
165-
CompositeElement(
166-
"A Great Day",
167-
),
168-
CompositeElement(
169-
"Today is a great day.\n\nIt is sunny outside.",
170-
),
171-
Table("Heading\nCell text"),
172-
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
173-
CompositeElement(
174-
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
175-
),
176-
]
177-
178-
179142
def test_chunk_by_title_separates_by_page_number():
180143
elements: list[Element] = [
181144
Title("A Great Day", metadata=ElementMetadata(page_number=1)),

test_unstructured/partition/epub/test_epub.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def test_partition_epub_from_filename_exclude_metadata():
7777
assert elements[0].metadata.filetype is None
7878
assert elements[0].metadata.page_name is None
7979
assert elements[0].metadata.filename is None
80-
assert elements[0].metadata.section is None
8180

8281

8382
def test_partition_epub_from_file_exlcude_metadata():
@@ -87,7 +86,6 @@ def test_partition_epub_from_file_exlcude_metadata():
8786
assert elements[0].metadata.filetype is None
8887
assert elements[0].metadata.page_name is None
8988
assert elements[0].metadata.filename is None
90-
assert elements[0].metadata.section is None
9189

9290

9391
def test_partition_epub_metadata_date(

test_unstructured/staging/test_base.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ def test_default_pandas_dtypes():
166166
sent_from=["sent", "from"],
167167
sent_to=["sent", "to"],
168168
subject="subject",
169-
section="section",
170169
header_footer_type="header_footer_type",
171170
emphasized_text_contents=["emphasized", "text", "contents"],
172171
emphasized_text_tags=["emphasized", "text", "tags"],
@@ -321,7 +320,6 @@ def test_convert_to_coco():
321320
sent_from=["sent", "from"],
322321
sent_to=["sent", "to"],
323322
subject="subject",
324-
section="section",
325323
header_footer_type="header_footer_type",
326324
emphasized_text_contents=["emphasized", "text", "contents"],
327325
emphasized_text_tags=["emphasized", "text", "tags"],
@@ -366,7 +364,6 @@ def test_convert_to_coco():
366364
sent_from=["sent", "from"],
367365
sent_to=["sent", "to"],
368366
subject="subject",
369-
section="section",
370367
header_footer_type="header_footer_type",
371368
emphasized_text_contents=["emphasized", "text", "contents"],
372369
emphasized_text_tags=["emphasized", "text", "tags"],

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.13.3" # pragma: no cover
1+
__version__ = "0.13.4-dev0" # pragma: no cover

unstructured/chunking/base.py

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,51 +1022,6 @@ def will_fit(self, pre_chunk: TextPreChunk) -> bool:
10221022
# ================================================================================================
10231023

10241024

1025-
def is_in_next_section() -> BoundaryPredicate:
1026-
"""Not a predicate itself, calling this returns a predicate that triggers on each new section.
1027-
1028-
The lifetime of the returned callable cannot extend beyond a single element-stream because it
1029-
stores current state (current section) that is particular to that element stream.
1030-
1031-
A "section" of this type is particular to the EPUB format (so far) and not to be confused with
1032-
a "section" composed of a section-heading (`Title` element) followed by content elements.
1033-
1034-
The returned predicate tracks the current section, starting at `None`. Calling with an element
1035-
with a different value for `metadata.section` returns True, indicating the element starts a new
1036-
section boundary, and updates the enclosed section name ready for the next transition.
1037-
"""
1038-
current_section: Optional[str] = None
1039-
is_first: bool = True
1040-
1041-
def section_changed(element: Element) -> bool:
1042-
nonlocal current_section, is_first
1043-
1044-
section = element.metadata.section
1045-
1046-
# -- The first element never reports a section break, it starts the first section of the
1047-
# -- document. That section could be named (section is non-None) or anonymous (section is
1048-
# -- None). We don't really have to care.
1049-
if is_first:
1050-
current_section = section
1051-
is_first = False
1052-
return False
1053-
1054-
# -- An element with a `None` section is assumed to continue the current section. It never
1055-
# -- updates the current-section because once set, the current-section is "sticky" until
1056-
# -- replaced by another explicit section.
1057-
if section is None:
1058-
return False
1059-
1060-
# -- another element with the same section continues that section --
1061-
if section == current_section:
1062-
return False
1063-
1064-
current_section = section
1065-
return True
1066-
1067-
return section_changed
1068-
1069-
10701025
def is_on_next_page() -> BoundaryPredicate:
10711026
"""Not a predicate itself, calling this returns a predicate that triggers on each new page.
10721027

unstructured/chunking/title.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
ChunkingOptions,
1414
PreChunkCombiner,
1515
PreChunker,
16-
is_in_next_section,
1716
is_on_next_page,
1817
is_title,
1918
)
@@ -121,7 +120,6 @@ def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
121120

122121
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
123122
yield is_title
124-
yield is_in_next_section()
125123
if not self.multipage_sections:
126124
yield is_on_next_page()
127125

unstructured/documents/elements.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,6 @@ class ElementMetadata:
191191
parent_id: Optional[str]
192192
# -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
193193
regex_metadata: Optional[dict[str, list[RegexMetadata]]]
194-
# -- EPUB document section --
195-
section: Optional[str]
196194

197195
# -- e-mail specific metadata fields --
198196
sent_from: Optional[list[str]]
@@ -235,7 +233,6 @@ def __init__(
235233
page_number: Optional[int] = None,
236234
parent_id: Optional[str] = None,
237235
regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
238-
section: Optional[str] = None,
239236
sent_from: Optional[list[str]] = None,
240237
sent_to: Optional[list[str]] = None,
241238
signature: Optional[str] = None,
@@ -275,7 +272,6 @@ def __init__(
275272
self.page_number = page_number
276273
self.parent_id = parent_id
277274
self.regex_metadata = regex_metadata
278-
self.section = section
279275
self.sent_from = sent_from
280276
self.sent_to = sent_to
281277
self.signature = signature
@@ -488,7 +484,6 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
488484
"page_number": cls.FIRST,
489485
"parent_id": cls.DROP,
490486
"regex_metadata": cls.REGEX,
491-
"section": cls.FIRST,
492487
"sent_from": cls.FIRST,
493488
"sent_to": cls.FIRST,
494489
"signature": cls.FIRST,
@@ -671,7 +666,7 @@ def to_dict(cls):
671666

672667

673668
class Element(abc.ABC):
674-
"""An element is a section of a page in the document.
669+
"""An element is a semantically-coherent component of a document, often a paragraph.
675670
676671
There are a few design principles that are followed when creating an element:
677672
1. It will always have an ID, which by default is a random UUID.
@@ -694,7 +689,9 @@ def __init__(
694689
metadata: Optional[ElementMetadata] = None,
695690
detection_origin: Optional[str] = None,
696691
):
697-
if element_id is not None and not isinstance(element_id, str):
692+
if element_id is not None and not isinstance(
693+
element_id, str
694+
): # pyright: ignore[reportUnnecessaryIsInstance]
698695
raise ValueError("element_id must be of type str or None.")
699696

700697
self._element_id = element_id
@@ -885,7 +882,12 @@ class Formula(Text):
885882

886883

887884
class CompositeElement(Text):
888-
"""A section of text consisting of a combination of elements."""
885+
"""A chunk formed from text (non-Table) elements.
886+
887+
Only produced by chunking. An instance may be formed by combining one or more sequential
888+
elements produced by partitioning. It is also used when text-splitting an "oversized" element,
889+
a single element that by itself is larger than the requested chunk size.
890+
"""
889891

890892
category = "CompositeElement"
891893

0 commit comments

Comments
 (0)