3535TABLE_TAGS : Final [List [str ]] = ["table" , "tbody" , "td" , "tr" ]
3636PAGEBREAK_TAGS : Final [List [str ]] = ["hr" ]
3737HEADER_OR_FOOTER_TAGS : Final [List [str ]] = ["header" , "footer" ]
38+ EMPTY_TAGS : Final [List [str ]] = ["br" , "hr" ]
3839
3940
4041class TagsMixin :
@@ -154,7 +155,6 @@ def doc_after_cleaners(
154155 """Filters the elements and returns a new instance of the class based on the criteria
155156 specified. Note that the number of pages can change in the case that all elements on a
156157 page are filtered out.
157-
158158 Parameters
159159 ----------
160160 skip_table_text:
@@ -283,7 +283,9 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
283283 """Determines if a tag potentially contains narrative text."""
284284 # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
285285 # it could be the text representation of a giant div
286- if len (tag_elem ) > max_predecessor_len :
286+ # Exclude empty tags from tag_elem
287+ empty_elems_len = len ([el for el in tag_elem .getchildren () if el .tag in EMPTY_TAGS ])
288+ if len (tag_elem ) > max_predecessor_len + empty_elems_len :
287289 return False
288290
289291 if tag_elem .tag in TEXT_TAGS + HEADING_TAGS :
@@ -320,7 +322,8 @@ def _process_list_item(
320322 next_text = _construct_text (next_element )
321323 # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
322324 # it could be the text representation of a giant div
323- if len (tag_elem ) > max_predecessor_len :
325+ empty_elems_len = len ([el for el in tag_elem .getchildren () if el .tag in EMPTY_TAGS ])
326+ if len (tag_elem ) > max_predecessor_len + empty_elems_len :
324327 return None , None
325328 if next_text :
326329 return HTMLListItem (text = next_text , tag = next_element .tag ), next_element
@@ -347,7 +350,6 @@ def is_list_item_tag(tag_elem: etree.Element) -> bool:
347350
348351def _bulleted_text_from_table (table ) -> List [Element ]:
349352 """Extracts bulletized narrative text from a table.
350-
351353 NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted.
352354 I.e., _read() will drop non-bullet narrative text in the table.
353355 """
0 commit comments