11# pyright: reportPrivateUsage=false
2+ # pyright: reportUnusedFunction=false
23
34"""Test suite for `unstructured.documents.html` module."""
45
2728 Text ,
2829 Title ,
2930)
30- from unstructured .documents .html import (
31+ from unstructured .documents .html import HTMLDocument , _parse_HTMLTable_from_table_elem
32+ from unstructured .documents .html_elements import (
3133 HTMLAddress ,
32- HTMLDocument ,
3334 HTMLNarrativeText ,
3435 HTMLTable ,
3536 HTMLText ,
3637 HTMLTitle ,
3738 TagsMixin ,
38- _parse_HTMLTable_from_table_elem ,
3939)
4040
4141TAGS = (
@@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path):
287287 f .write (
288288 "<html>\n "
289289 " <body>\n "
290- " <header>\n "
291- " <p>Here is a header. We want to ignore anything that is in this section.</p>\n "
292- " </header>\n "
293290 " <h1>A Great and Glorious Section</h1>\n "
294291 " <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n "
295292 " <p></p>\n "
@@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path):
298295 " <table>\n "
299296 " <tbody>\n "
300297 " <tr>\n "
301- " <td><p>Skip me because I'm in a table</p></td>\n "
298+ " <td><p>I'm in a table</p></td>\n "
302299 " </tr>\n "
303300 " </tbody>\n "
304301 " </table>\n "
305302 " <hr>\n "
306303 " <h2>A New Beginning</h2>\n "
307304 " <div>Here is the start of a new page.</div>\n "
308- " <footer>\n "
309- " <p>Here is a footer. We want to ignore anything that is in this section</p>\n "
310- " </footer>\n "
311- " <div>\n "
312- " <p>Let's ignore anything after the footer too since it's probably garbage.</p>\n "
313- " </div>\n "
314305 " </body>\n "
315306 "</html>\n "
316307 )
317308
318- html_document = HTMLDocument .from_file (filename = filename ).doc_after_cleaners (
319- skip_headers_and_footers = True , skip_table = True
320- )
309+ html_document = HTMLDocument .from_file (filename )
321310
322311 assert len (html_document .pages ) == 2
323312 assert all (isinstance (p , Page ) for p in html_document .pages )
324313 # --
325- page_one = html_document .pages [0 ]
326- assert len (page_one .elements ) == 4
327- assert page_one .elements == [
314+ p = html_document .pages [0 ]
315+ assert len (p .elements ) == 5
316+ assert p .elements == [
328317 Title ("A Great and Glorious Section" ),
329318 NarrativeText ("Dear Leader is the best. He is such a wonderful engineer!" ),
330319 Title ("Another Magnificent Title" ),
331320 NarrativeText ("The prior element is a title based on its capitalization patterns!" ),
321+ Table ("I'm in a table" ),
332322 ]
333323 # --
334- page_two = html_document .pages [1 ]
335- assert len (page_two .elements ) == 2
336- assert page_two .elements == [
324+ p = html_document .pages [1 ]
325+ assert len (p .elements ) == 2
326+ assert p .elements == [
337327 Title ("A New Beginning" ),
338328 NarrativeText ("Here is the start of a new page." ),
339329 ]
@@ -348,64 +338,6 @@ def test_HTMLDocument_can_construct_from_existing_pages():
348338 assert html_document .pages == [page ]
349339
350340
351- # -- HTMLDocument.doc_after_cleaners() -----------------------------------------------------------
352-
353-
354- def test_include_headers_and_footers (sample_doc : HTMLDocument ):
355- html_document = sample_doc .doc_after_cleaners (skip_headers_and_footers = False )
356- assert len (html_document .pages [1 ].elements ) == 3
357-
358-
359- def test_read_without_skipping_table (is_possible_narrative_text_ : Mock ):
360- is_possible_narrative_text_ .return_value = True
361- document = HTMLDocument .from_string (
362- "<html>\n "
363- " <body>\n "
364- " <table>\n "
365- " <tbody>\n "
366- " <tr>\n "
367- " <td><p>Hi there! I am Matt!</p></td>\n "
368- " </tr>\n "
369- " </tbody>\n "
370- " </table>\n "
371- " </body>\n "
372- "</html>\n "
373- ).doc_after_cleaners (skip_table = False )
374- assert document .pages [0 ].elements [0 ] == Table (text = "Hi there! I am Matt!" )
375-
376-
377- def test_include_table_text (sample_doc : HTMLDocument ):
378- html_document = sample_doc .doc_after_cleaners (skip_table = False )
379- assert len (html_document .pages [0 ].elements ) == 2
380-
381-
382- def test_tag_types_table (sample_doc : HTMLDocument ):
383- html_document = sample_doc .doc_after_cleaners (skip_table = True )
384- assert len (html_document .pages [0 ].elements ) == 2
385-
386-
387- def test_cleaner_raises_on_non_element_elements (sample_doc : HTMLDocument , pages_prop_ : Mock ):
388- page = Page (0 )
389- page .elements = [
390- "this should def not be a string" # pyright: ignore[reportAttributeAccessIssue]
391- ]
392- pages_prop_ .return_value = [page ]
393- with pytest .raises (ValueError ):
394- sample_doc .doc_after_cleaners ()
395-
396-
397- def test_cleaner_can_filter_out_tables_in_place ():
398- doc = HTMLDocument .from_string (
399- "<table><tbody><tr><td>A table thing.</td></tr></tbody></table>\n "
400- "<p>A non-table thing</p>\n "
401- )
402- assert len (doc .elements ) == 2
403-
404- doc .doc_after_cleaners (skip_table = True , inplace = True )
405-
406- assert len (doc .elements ) == 1
407-
408-
409341# -- HTMLDocument.elements -----------------------------------------------------------------------
410342
411343
@@ -429,19 +361,16 @@ def test_parses_tags_correctly():
429361
430362
431363def test_nested_text_tags ():
432- tag1 , tag2 = [tag for tag in html .TEXT_TAGS if tag not in html .TABLE_TAGS ][:2 ]
433- html_str = (
434- f"<body>\n "
435- f" <{ tag1 } >\n "
436- f" <{ tag2 } >\n "
437- f" There is some text here.\n "
438- f" </{ tag2 } >\n "
439- f" </{ tag1 } >\n "
440- f"</body>\n "
364+ html_document = HTMLDocument .from_string (
365+ "<body>\n "
366+ " <p>\n "
367+ " <a>\n "
368+ " There is some text here.\n "
369+ " </a>\n "
370+ " </p>\n "
371+ "</body>\n "
441372 )
442373
443- html_document = HTMLDocument .from_string (html_str ).doc_after_cleaners (skip_table = False )
444-
445374 assert len (html_document .pages [0 ].elements ) == 1
446375
447376
@@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str):
575504 assert len (html_document .pages ) == 0
576505
577506
578- # -- has_table_ancestor() ------------------------------------------------------------------------
579-
580-
581- def test_has_table_ancestor ():
582- title = HTMLTitle ("I am a Title" , tag = "td" , ancestortags = ["html" , "body" , "table" , "tr" ])
583- assert html .has_table_ancestor (title )
584-
585-
586- def test_has_no_table_ancestor ():
587- title = HTMLTitle ("I am a Title" , tag = "p" , ancestortags = ["html" , "body" ])
588- assert not html .has_table_ancestor (title )
589-
590-
591507# -- _bulleted_text_from_table() -----------------------------------------------------------------
592508
593509
@@ -856,8 +772,8 @@ def test_parse_nothing():
856772 assert parsed_el is None
857773
858774
859- def test_parse_not_anything (is_narrative_tag_ : Mock , is_possible_title_ : Mock ):
860- is_narrative_tag_ .return_value = False
775+ def test_parse_not_anything (_is_narrative_tag_ : Mock , is_possible_title_ : Mock ): # noqa: PT019
776+ _is_narrative_tag_ .return_value = False
861777 is_possible_title_ .return_value = False
862778 doc = """<p>This is nothing</p>"""
863779 document_tree = etree .fromstring (doc , etree .HTMLParser ())
@@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text():
942858 document_tree = etree .fromstring (doc , etree .HTMLParser ())
943859 el = document_tree .find (".//div" )
944860 assert el is not None
945- assert html .is_list_item_tag (el ) is True
861+ assert html ._is_list_item_tag (el ) is True
946862 parsed_el , _ = html ._process_list_item (el )
947863 assert parsed_el is None
948864
@@ -1071,8 +987,8 @@ def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nest
1071987
1072988
1073989@pytest .fixture
1074- def is_narrative_tag_ (request : FixtureRequest ):
1075- return function_mock (request , "unstructured.documents.html.is_narrative_tag " )
990+ def _is_narrative_tag_ (request : FixtureRequest ):
991+ return function_mock (request , "unstructured.documents.html._is_narrative_tag " )
1076992
1077993
1078994@pytest .fixture
0 commit comments