fix: remove unused ElementMetadata.section (#2921)

scanny · web-flow · commit 05ff9750813b · 2024-04-22T23:58:17.000Z
**Summary**
The `.section` field in `ElementMetadata` is dead code, possibly a
remainder from a prior iteration of `partition_epub()`. In any case, it
is not populated by any partitioner. Remove it and any code that uses
it.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.13.4-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* **Remove ElementMetadata.section field.** This field was unused, not populated by any partitioners.
+
 ## 0.13.3
 
 ### Enhancements
diff --git a/docs/source/core/chunking.rst b/docs/source/core/chunking.rst
@@ -152,13 +152,6 @@ following behaviors:
   ``Title`` element would fit in the prior chunk. This implements the first aspect of the "preserve
   section boundaries" contract.
 
-- **Detect metadata.section change.** An element with a new value in ``element.metadata.section`` is
-  considered to start a new section. When a change in this value is encountered a new chunk is
-  started. This implements the second aspect of preserving section boundaries. This metadata is not
-  present in all document formats so is not used alone. An element having ``None`` for this metadata
-  field is considered to be part of the prior section; a section break is only detected on an
-  explicit change in value.
-
 - **Respect page boundaries.** Page boundaries can optionally also be respected using the
   ``multipage_sections`` argument. This defaults to ``True`` meaning that a page break does *not*
   start a new chunk. Setting this to ``False`` will separate elements that occur on different pages
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
@@ -17,7 +17,6 @@
     TextPreChunk,
     TextPreChunkAccumulator,
     _TextSplitter,
-    is_in_next_section,
     is_on_next_page,
     is_title,
 )
@@ -1514,68 +1513,6 @@ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
 # ================================================================================================
 
 
-class Describe_is_in_next_section:
-    """Unit-test suite for `unstructured.chunking.base.is_in_next_section()` function.
-
-    `is_in_next_section()` is not itself a predicate, rather it returns a predicate on Element
-    (`Callable[[Element], bool]`) that can be called repeatedly to detect section changes in an
-    element stream.
-    """
-
-    def it_is_false_for_the_first_element_when_it_has_a_non_None_section(self):
-        """This is an explicit first-section; first-section does not represent a section break."""
-        pred = is_in_next_section()
-        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
-
-    def and_it_is_false_for_the_first_element_when_it_has_a_None_section(self):
-        """This is an anonymous first-section; still doesn't represent a section break."""
-        pred = is_in_next_section()
-        assert not pred(Text("abcd"))
-
-    def it_is_false_for_None_section_elements_that_follow_an_explicit_first_section(self):
-        """A `None` section element is considered to continue the prior section."""
-        pred = is_in_next_section()
-        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
-        assert not pred(Text("efgh"))
-        assert not pred(Text("ijkl"))
-
-    def and_it_is_false_for_None_section_elements_that_follow_an_anonymous_first_section(self):
-        """A `None` section element is considered to continue the prior section."""
-        pred = is_in_next_section()
-        assert not pred(Text("abcd"))
-        assert not pred(Text("efgh"))
-        assert not pred(Text("ijkl"))
-
-    def it_is_false_for_matching_section_elements_that_follow_an_explicit_first_section(self):
-        pred = is_in_next_section()
-        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
-        assert not pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
-        assert not pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
-
-    def it_is_true_for_an_explicit_section_element_that_follows_an_anonymous_first_section(self):
-        pred = is_in_next_section()
-        assert not pred(Text("abcd"))
-        assert not pred(Text("efgh"))
-        assert pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
-
-    def and_it_is_true_for_a_different_explicit_section_that_follows_an_explicit_section(self):
-        pred = is_in_next_section()
-        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
-        assert pred(Text("efgh", metadata=ElementMetadata(section="Summary")))
-
-    def it_is_true_whenever_the_section_explicitly_changes_except_at_the_start(self):
-        pred = is_in_next_section()
-        assert not pred(Text("abcd"))
-        assert pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
-        assert not pred(Text("ijkl"))
-        assert not pred(Text("mnop", metadata=ElementMetadata(section="Introduction")))
-        assert not pred(Text("qrst"))
-        assert pred(Text("uvwx", metadata=ElementMetadata(section="Summary")))
-        assert not pred(Text("yzab", metadata=ElementMetadata(section="Summary")))
-        assert not pred(Text("cdef"))
-        assert pred(Text("ghij", metadata=ElementMetadata(section="Appendix")))
-
-
 class Describe_is_on_next_page:
     """Unit-test suite for `unstructured.chunking.base.is_on_next_page()` function.
 
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
@@ -139,43 +139,6 @@ def test_chunk_by_title():
     )
 
 
-def test_chunk_by_title_respects_section_change():
-    elements: list[Element] = [
-        Title("A Great Day", metadata=ElementMetadata(section="first")),
-        Text("Today is a great day.", metadata=ElementMetadata(section="second")),
-        Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
-        Table("Heading\nCell text"),
-        Title("An Okay Day"),
-        Text("Today is an okay day."),
-        Text("It is rainy outside."),
-        Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
-        Text("It is storming outside."),
-        CheckBox(),
-    ]
-
-    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
-
-    assert chunks == [
-        CompositeElement(
-            "A Great Day",
-        ),
-        CompositeElement(
-            "Today is a great day.\n\nIt is sunny outside.",
-        ),
-        Table("Heading\nCell text"),
-        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
-        CompositeElement(
-            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
-        ),
-    ]
-
-
 def test_chunk_by_title_separates_by_page_number():
     elements: list[Element] = [
         Title("A Great Day", metadata=ElementMetadata(page_number=1)),
diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py
@@ -77,7 +77,6 @@ def test_partition_epub_from_filename_exclude_metadata():
     assert elements[0].metadata.filetype is None
     assert elements[0].metadata.page_name is None
     assert elements[0].metadata.filename is None
-    assert elements[0].metadata.section is None
 
 
 def test_partition_epub_from_file_exlcude_metadata():
@@ -87,7 +86,6 @@ def test_partition_epub_from_file_exlcude_metadata():
     assert elements[0].metadata.filetype is None
     assert elements[0].metadata.page_name is None
     assert elements[0].metadata.filename is None
-    assert elements[0].metadata.section is None
 
 
 def test_partition_epub_metadata_date(
diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py
@@ -166,7 +166,6 @@ def test_default_pandas_dtypes():
             sent_from=["sent", "from"],
             sent_to=["sent", "to"],
             subject="subject",
-            section="section",
             header_footer_type="header_footer_type",
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
@@ -321,7 +320,6 @@ def test_convert_to_coco():
                 sent_from=["sent", "from"],
                 sent_to=["sent", "to"],
                 subject="subject",
-                section="section",
                 header_footer_type="header_footer_type",
                 emphasized_text_contents=["emphasized", "text", "contents"],
                 emphasized_text_tags=["emphasized", "text", "tags"],
@@ -366,7 +364,6 @@ def test_convert_to_coco():
                 sent_from=["sent", "from"],
                 sent_to=["sent", "to"],
                 subject="subject",
-                section="section",
                 header_footer_type="header_footer_type",
                 emphasized_text_contents=["emphasized", "text", "contents"],
                 emphasized_text_tags=["emphasized", "text", "tags"],
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.3"  # pragma: no cover
+__version__ = "0.13.4-dev0"  # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
@@ -1022,51 +1022,6 @@ def will_fit(self, pre_chunk: TextPreChunk) -> bool:
 # ================================================================================================
 
 
-def is_in_next_section() -> BoundaryPredicate:
-    """Not a predicate itself, calling this returns a predicate that triggers on each new section.
-
-    The lifetime of the returned callable cannot extend beyond a single element-stream because it
-    stores current state (current section) that is particular to that element stream.
-
-    A "section" of this type is particular to the EPUB format (so far) and not to be confused with
-    a "section" composed of a section-heading (`Title` element) followed by content elements.
-
-    The returned predicate tracks the current section, starting at `None`. Calling with an element
-    with a different value for `metadata.section` returns True, indicating the element starts a new
-    section boundary, and updates the enclosed section name ready for the next transition.
-    """
-    current_section: Optional[str] = None
-    is_first: bool = True
-
-    def section_changed(element: Element) -> bool:
-        nonlocal current_section, is_first
-
-        section = element.metadata.section
-
-        # -- The first element never reports a section break, it starts the first section of the
-        # -- document. That section could be named (section is non-None) or anonymous (section is
-        # -- None). We don't really have to care.
-        if is_first:
-            current_section = section
-            is_first = False
-            return False
-
-        # -- An element with a `None` section is assumed to continue the current section. It never
-        # -- updates the current-section because once set, the current-section is "sticky" until
-        # -- replaced by another explicit section.
-        if section is None:
-            return False
-
-        # -- another element with the same section continues that section --
-        if section == current_section:
-            return False
-
-        current_section = section
-        return True
-
-    return section_changed
-
-
 def is_on_next_page() -> BoundaryPredicate:
     """Not a predicate itself, calling this returns a predicate that triggers on each new page.
 
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
@@ -13,7 +13,6 @@
     ChunkingOptions,
     PreChunkCombiner,
     PreChunker,
-    is_in_next_section,
     is_on_next_page,
     is_title,
 )
@@ -121,7 +120,6 @@ def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
 
         def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
             yield is_title
-            yield is_in_next_section()
             if not self.multipage_sections:
                 yield is_on_next_page()
 
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
@@ -191,8 +191,6 @@ class ElementMetadata:
     parent_id: Optional[str]
     # -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
     regex_metadata: Optional[dict[str, list[RegexMetadata]]]
-    # -- EPUB document section --
-    section: Optional[str]
 
     # -- e-mail specific metadata fields --
     sent_from: Optional[list[str]]
@@ -235,7 +233,6 @@ def __init__(
         page_number: Optional[int] = None,
         parent_id: Optional[str] = None,
         regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
-        section: Optional[str] = None,
         sent_from: Optional[list[str]] = None,
         sent_to: Optional[list[str]] = None,
         signature: Optional[str] = None,
@@ -275,7 +272,6 @@ def __init__(
         self.page_number = page_number
         self.parent_id = parent_id
         self.regex_metadata = regex_metadata
-        self.section = section
         self.sent_from = sent_from
         self.sent_to = sent_to
         self.signature = signature
@@ -488,7 +484,6 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
             "page_number": cls.FIRST,
             "parent_id": cls.DROP,
             "regex_metadata": cls.REGEX,
-            "section": cls.FIRST,
             "sent_from": cls.FIRST,
             "sent_to": cls.FIRST,
             "signature": cls.FIRST,
@@ -671,7 +666,7 @@ def to_dict(cls):
 
 
 class Element(abc.ABC):
-    """An element is a section of a page in the document.
+    """An element is a semantically-coherent component of a document, often a paragraph.
 
     There are a few design principles that are followed when creating an element:
     1. It will always have an ID, which by default is a random UUID.
@@ -694,7 +689,9 @@ def __init__(
         metadata: Optional[ElementMetadata] = None,
         detection_origin: Optional[str] = None,
     ):
-        if element_id is not None and not isinstance(element_id, str):
+        if element_id is not None and not isinstance(
+            element_id, str
+        ):  # pyright: ignore[reportUnnecessaryIsInstance]
             raise ValueError("element_id must be of type str or None.")
 
         self._element_id = element_id
@@ -885,7 +882,12 @@ class Formula(Text):
 
 
 class CompositeElement(Text):
-    """A section of text consisting of a combination of elements."""
+    """A chunk formed from text (non-Table) elements.
+
+    Only produced by chunking. An instance may be formed by combining one or more sequential
+    elements produced by partitioning. It is also used when text-splitting an "oversized" element,
+    a single element that by itself is larger than the requested chunk size.
+    """
 
     category = "CompositeElement"
 
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
@@ -272,7 +272,6 @@ def add_element_metadata(
     text_as_html: Optional[str] = None,
     coordinates: Optional[tuple[tuple[float, float], ...]] = None,
     coordinate_system: Optional[CoordinateSystem] = None,
-    section: Optional[str] = None,
     image_path: Optional[str] = None,
     detection_origin: Optional[str] = None,
     languages: Optional[List[str]] = None,
@@ -324,7 +323,6 @@ def add_element_metadata(
         link_start_indexes=link_start_indexes,
         emphasized_text_contents=emphasized_text_contents,
         emphasized_text_tags=emphasized_text_tags,
-        section=section,
         category_depth=depth,
         image_path=image_path,
         languages=languages,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.13.3" # pragma: no cover`
	`1`	`+__version__ = "0.13.4-dev0" # pragma: no cover`