Skip to content

Commit 2f5c61c

Browse files
author
Umar Farooqi
authored
fix: exclude empty tags during depth check (#379)
1 parent 19fb303 commit 2f5c61c

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

unstructured/documents/html.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"]
3636
PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
3737
HEADER_OR_FOOTER_TAGS: Final[List[str]] = ["header", "footer"]
38+
EMPTY_TAGS: Final[List[str]] = ["br", "hr"]
3839

3940

4041
class TagsMixin:
@@ -154,7 +155,6 @@ def doc_after_cleaners(
154155
"""Filters the elements and returns a new instance of the class based on the criteria
155156
specified. Note that the number of pages can change in the case that all elements on a
156157
page are filtered out.
157-
158158
Parameters
159159
----------
160160
skip_table_text:
@@ -283,7 +283,9 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
283283
"""Determines if a tag potentially contains narrative text."""
284284
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
285285
# it could be the text representation of a giant div
286-
if len(tag_elem) > max_predecessor_len:
286+
# Exclude empty tags from tag_elem
287+
empty_elems_len = len([el for el in tag_elem.getchildren() if el.tag in EMPTY_TAGS])
288+
if len(tag_elem) > max_predecessor_len + empty_elems_len:
287289
return False
288290

289291
if tag_elem.tag in TEXT_TAGS + HEADING_TAGS:
@@ -320,7 +322,8 @@ def _process_list_item(
320322
next_text = _construct_text(next_element)
321323
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
322324
# it could be the text representation of a giant div
323-
if len(tag_elem) > max_predecessor_len:
325+
empty_elems_len = len([el for el in tag_elem.getchildren() if el.tag in EMPTY_TAGS])
326+
if len(tag_elem) > max_predecessor_len + empty_elems_len:
324327
return None, None
325328
if next_text:
326329
return HTMLListItem(text=next_text, tag=next_element.tag), next_element
@@ -347,7 +350,6 @@ def is_list_item_tag(tag_elem: etree.Element) -> bool:
347350

348351
def _bulleted_text_from_table(table) -> List[Element]:
349352
"""Extracts bulletized narrative text from a table.
350-
351353
NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted.
352354
I.e., _read() will drop non-bullet narrative text in the table.
353355
"""

0 commit comments

Comments
 (0)