Skip to content

Commit a883fc9

Browse files
authored
rfctr(html): improve SNR in HTMLDocument (#3162)
**Summary** Remove dead code and organize the helpers of `HTMLDocument` in preparation for the improvements and bug fixes to follow.
1 parent 8378dda commit a883fc9

File tree

7 files changed

+395
-538
lines changed

7 files changed

+395
-538
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
## 0.14.5-dev5
1+
## 0.14.5-dev6
22

33
### Enhancements
44

test_unstructured/documents/test_html.py

Lines changed: 25 additions & 109 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
# pyright: reportPrivateUsage=false
2+
# pyright: reportUnusedFunction=false
23

34
"""Test suite for `unstructured.documents.html` module."""
45

@@ -27,15 +28,14 @@
2728
Text,
2829
Title,
2930
)
30-
from unstructured.documents.html import (
31+
from unstructured.documents.html import HTMLDocument, _parse_HTMLTable_from_table_elem
32+
from unstructured.documents.html_elements import (
3133
HTMLAddress,
32-
HTMLDocument,
3334
HTMLNarrativeText,
3435
HTMLTable,
3536
HTMLText,
3637
HTMLTitle,
3738
TagsMixin,
38-
_parse_HTMLTable_from_table_elem,
3939
)
4040

4141
TAGS = (
@@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path):
287287
f.write(
288288
"<html>\n"
289289
" <body>\n"
290-
" <header>\n"
291-
" <p>Here is a header. We want to ignore anything that is in this section.</p>\n"
292-
" </header>\n"
293290
" <h1>A Great and Glorious Section</h1>\n"
294291
" <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n"
295292
" <p></p>\n"
@@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path):
298295
" <table>\n"
299296
" <tbody>\n"
300297
" <tr>\n"
301-
" <td><p>Skip me because I'm in a table</p></td>\n"
298+
" <td><p>I'm in a table</p></td>\n"
302299
" </tr>\n"
303300
" </tbody>\n"
304301
" </table>\n"
305302
" <hr>\n"
306303
" <h2>A New Beginning</h2>\n"
307304
" <div>Here is the start of a new page.</div>\n"
308-
" <footer>\n"
309-
" <p>Here is a footer. We want to ignore anything that is in this section</p>\n"
310-
" </footer>\n"
311-
" <div>\n"
312-
" <p>Let's ignore anything after the footer too since it's probably garbage.</p>\n"
313-
" </div>\n"
314305
" </body>\n"
315306
"</html>\n"
316307
)
317308

318-
html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
319-
skip_headers_and_footers=True, skip_table=True
320-
)
309+
html_document = HTMLDocument.from_file(filename)
321310

322311
assert len(html_document.pages) == 2
323312
assert all(isinstance(p, Page) for p in html_document.pages)
324313
# --
325-
page_one = html_document.pages[0]
326-
assert len(page_one.elements) == 4
327-
assert page_one.elements == [
314+
p = html_document.pages[0]
315+
assert len(p.elements) == 5
316+
assert p.elements == [
328317
Title("A Great and Glorious Section"),
329318
NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
330319
Title("Another Magnificent Title"),
331320
NarrativeText("The prior element is a title based on its capitalization patterns!"),
321+
Table("I'm in a table"),
332322
]
333323
# --
334-
page_two = html_document.pages[1]
335-
assert len(page_two.elements) == 2
336-
assert page_two.elements == [
324+
p = html_document.pages[1]
325+
assert len(p.elements) == 2
326+
assert p.elements == [
337327
Title("A New Beginning"),
338328
NarrativeText("Here is the start of a new page."),
339329
]
@@ -348,64 +338,6 @@ def test_HTMLDocument_can_construct_from_existing_pages():
348338
assert html_document.pages == [page]
349339

350340

351-
# -- HTMLDocument.doc_after_cleaners() -----------------------------------------------------------
352-
353-
354-
def test_include_headers_and_footers(sample_doc: HTMLDocument):
355-
html_document = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
356-
assert len(html_document.pages[1].elements) == 3
357-
358-
359-
def test_read_without_skipping_table(is_possible_narrative_text_: Mock):
360-
is_possible_narrative_text_.return_value = True
361-
document = HTMLDocument.from_string(
362-
"<html>\n"
363-
" <body>\n"
364-
" <table>\n"
365-
" <tbody>\n"
366-
" <tr>\n"
367-
" <td><p>Hi there! I am Matt!</p></td>\n"
368-
" </tr>\n"
369-
" </tbody>\n"
370-
" </table>\n"
371-
" </body>\n"
372-
"</html>\n"
373-
).doc_after_cleaners(skip_table=False)
374-
assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!")
375-
376-
377-
def test_include_table_text(sample_doc: HTMLDocument):
378-
html_document = sample_doc.doc_after_cleaners(skip_table=False)
379-
assert len(html_document.pages[0].elements) == 2
380-
381-
382-
def test_tag_types_table(sample_doc: HTMLDocument):
383-
html_document = sample_doc.doc_after_cleaners(skip_table=True)
384-
assert len(html_document.pages[0].elements) == 2
385-
386-
387-
def test_cleaner_raises_on_non_element_elements(sample_doc: HTMLDocument, pages_prop_: Mock):
388-
page = Page(0)
389-
page.elements = [
390-
"this should def not be a string" # pyright: ignore[reportAttributeAccessIssue]
391-
]
392-
pages_prop_.return_value = [page]
393-
with pytest.raises(ValueError):
394-
sample_doc.doc_after_cleaners()
395-
396-
397-
def test_cleaner_can_filter_out_tables_in_place():
398-
doc = HTMLDocument.from_string(
399-
"<table><tbody><tr><td>A table thing.</td></tr></tbody></table>\n"
400-
"<p>A non-table thing</p>\n"
401-
)
402-
assert len(doc.elements) == 2
403-
404-
doc.doc_after_cleaners(skip_table=True, inplace=True)
405-
406-
assert len(doc.elements) == 1
407-
408-
409341
# -- HTMLDocument.elements -----------------------------------------------------------------------
410342

411343

@@ -429,19 +361,16 @@ def test_parses_tags_correctly():
429361

430362

431363
def test_nested_text_tags():
432-
tag1, tag2 = [tag for tag in html.TEXT_TAGS if tag not in html.TABLE_TAGS][:2]
433-
html_str = (
434-
f"<body>\n"
435-
f" <{tag1}>\n"
436-
f" <{tag2}>\n"
437-
f" There is some text here.\n"
438-
f" </{tag2}>\n"
439-
f" </{tag1}>\n"
440-
f"</body>\n"
364+
html_document = HTMLDocument.from_string(
365+
"<body>\n"
366+
" <p>\n"
367+
" <a>\n"
368+
" There is some text here.\n"
369+
" </a>\n"
370+
" </p>\n"
371+
"</body>\n"
441372
)
442373

443-
html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)
444-
445374
assert len(html_document.pages[0].elements) == 1
446375

447376

@@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str):
575504
assert len(html_document.pages) == 0
576505

577506

578-
# -- has_table_ancestor() ------------------------------------------------------------------------
579-
580-
581-
def test_has_table_ancestor():
582-
title = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"])
583-
assert html.has_table_ancestor(title)
584-
585-
586-
def test_has_no_table_ancestor():
587-
title = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"])
588-
assert not html.has_table_ancestor(title)
589-
590-
591507
# -- _bulleted_text_from_table() -----------------------------------------------------------------
592508

593509

@@ -856,8 +772,8 @@ def test_parse_nothing():
856772
assert parsed_el is None
857773

858774

859-
def test_parse_not_anything(is_narrative_tag_: Mock, is_possible_title_: Mock):
860-
is_narrative_tag_.return_value = False
775+
def test_parse_not_anything(_is_narrative_tag_: Mock, is_possible_title_: Mock): # noqa: PT019
776+
_is_narrative_tag_.return_value = False
861777
is_possible_title_.return_value = False
862778
doc = """<p>This is nothing</p>"""
863779
document_tree = etree.fromstring(doc, etree.HTMLParser())
@@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text():
942858
document_tree = etree.fromstring(doc, etree.HTMLParser())
943859
el = document_tree.find(".//div")
944860
assert el is not None
945-
assert html.is_list_item_tag(el) is True
861+
assert html._is_list_item_tag(el) is True
946862
parsed_el, _ = html._process_list_item(el)
947863
assert parsed_el is None
948864

@@ -1071,8 +987,8 @@ def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nest
1071987

1072988

1073989
@pytest.fixture
1074-
def is_narrative_tag_(request: FixtureRequest):
1075-
return function_mock(request, "unstructured.documents.html.is_narrative_tag")
990+
def _is_narrative_tag_(request: FixtureRequest):
991+
return function_mock(request, "unstructured.documents.html._is_narrative_tag")
1076992

1077993

1078994
@pytest.fixture

test_unstructured/partition/test_html.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
TableChunk,
2929
Title,
3030
)
31-
from unstructured.documents.html import HTMLTable, TagsMixin
31+
from unstructured.documents.html_elements import HTMLTable, TagsMixin
3232
from unstructured.partition.html import partition_html
3333

3434
# -- document-source (filename, file, text, url) -------------------------------------------------

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.14.5-dev5" # pragma: no cover
1+
__version__ = "0.14.5-dev6" # pragma: no cover

0 commit comments

Comments
 (0)