Unstructured-IO
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 1 deletion
diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
Lines changed: 25 additions & 109 deletions
diff --git a/test_unstructured/partition/test_html.py b/test_unstructured/partition/test_html.py
Lines changed: 1 addition & 1 deletion
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-## 0.14.5-dev5
+## 0.14.5-dev6
 
 ### Enhancements
 
 
@@ -1,4 +1,5 @@
 # pyright: reportPrivateUsage=false
+# pyright: reportUnusedFunction=false
 
 """Test suite for `unstructured.documents.html` module."""
 
@@ -27,15 +28,14 @@
     Text,
     Title,
 )
-from unstructured.documents.html import (
+from unstructured.documents.html import HTMLDocument, _parse_HTMLTable_from_table_elem
+from unstructured.documents.html_elements import (
     HTMLAddress,
-    HTMLDocument,
     HTMLNarrativeText,
     HTMLTable,
     HTMLText,
     HTMLTitle,
     TagsMixin,
-    _parse_HTMLTable_from_table_elem,
 )
 
 TAGS = (
@@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path):
         f.write(
             "<html>\n"
             "  <body>\n"
-            "    <header>\n"
-            "      <p>Here is a header. We want to ignore anything that is in this section.</p>\n"
-            "    </header>\n"
             "    <h1>A Great and Glorious Section</h1>\n"
             "    <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n"
             "    <p></p>\n"
@@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path):
             "    <table>\n"
             "      <tbody>\n"
             "        <tr>\n"
-            "          <td><p>Skip me because I'm in a table</p></td>\n"
+            "          <td><p>I'm in a table</p></td>\n"
             "        </tr>\n"
             "      </tbody>\n"
             "    </table>\n"
             "    <hr>\n"
             "    <h2>A New Beginning</h2>\n"
             "    <div>Here is the start of a new page.</div>\n"
-            "    <footer>\n"
-            "      <p>Here is a footer. We want to ignore anything that is in this section</p>\n"
-            "    </footer>\n"
-            "    <div>\n"
-            "      <p>Let's ignore anything after the footer too since it's probably garbage.</p>\n"
-            "    </div>\n"
             "  </body>\n"
             "</html>\n"
         )
 
-    html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
-        skip_headers_and_footers=True, skip_table=True
-    )
+    html_document = HTMLDocument.from_file(filename)
 
     assert len(html_document.pages) == 2
     assert all(isinstance(p, Page) for p in html_document.pages)
     # --
-    page_one = html_document.pages[0]
-    assert len(page_one.elements) == 4
-    assert page_one.elements == [
+    p = html_document.pages[0]
+    assert len(p.elements) == 5
+    assert p.elements == [
         Title("A Great and Glorious Section"),
         NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
         Title("Another Magnificent Title"),
         NarrativeText("The prior element is a title based on its capitalization patterns!"),
+        Table("I'm in a table"),
     ]
     # --
-    page_two = html_document.pages[1]
-    assert len(page_two.elements) == 2
-    assert page_two.elements == [
+    p = html_document.pages[1]
+    assert len(p.elements) == 2
+    assert p.elements == [
         Title("A New Beginning"),
         NarrativeText("Here is the start of a new page."),
     ]
@@ -348,64 +338,6 @@ def test_HTMLDocument_can_construct_from_existing_pages():
     assert html_document.pages == [page]
 
 
-# -- HTMLDocument.doc_after_cleaners() -----------------------------------------------------------
-
-
-def test_include_headers_and_footers(sample_doc: HTMLDocument):
-    html_document = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
-    assert len(html_document.pages[1].elements) == 3
-
-
-def test_read_without_skipping_table(is_possible_narrative_text_: Mock):
-    is_possible_narrative_text_.return_value = True
-    document = HTMLDocument.from_string(
-        "<html>\n"
-        "  <body>\n"
-        "    <table>\n"
-        "      <tbody>\n"
-        "        <tr>\n"
-        "          <td><p>Hi there! I am Matt!</p></td>\n"
-        "        </tr>\n"
-        "      </tbody>\n"
-        "    </table>\n"
-        "  </body>\n"
-        "</html>\n"
-    ).doc_after_cleaners(skip_table=False)
-    assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!")
-
-
-def test_include_table_text(sample_doc: HTMLDocument):
-    html_document = sample_doc.doc_after_cleaners(skip_table=False)
-    assert len(html_document.pages[0].elements) == 2
-
-
-def test_tag_types_table(sample_doc: HTMLDocument):
-    html_document = sample_doc.doc_after_cleaners(skip_table=True)
-    assert len(html_document.pages[0].elements) == 2
-
-
-def test_cleaner_raises_on_non_element_elements(sample_doc: HTMLDocument, pages_prop_: Mock):
-    page = Page(0)
-    page.elements = [
-        "this should def not be a string"  # pyright: ignore[reportAttributeAccessIssue]
-    ]
-    pages_prop_.return_value = [page]
-    with pytest.raises(ValueError):
-        sample_doc.doc_after_cleaners()
-
-
-def test_cleaner_can_filter_out_tables_in_place():
-    doc = HTMLDocument.from_string(
-        "<table><tbody><tr><td>A table thing.</td></tr></tbody></table>\n"
-        "<p>A non-table thing</p>\n"
-    )
-    assert len(doc.elements) == 2
-
-    doc.doc_after_cleaners(skip_table=True, inplace=True)
-
-    assert len(doc.elements) == 1
-
-
 # -- HTMLDocument.elements -----------------------------------------------------------------------
 
 
@@ -429,19 +361,16 @@ def test_parses_tags_correctly():
 
 
 def test_nested_text_tags():
-    tag1, tag2 = [tag for tag in html.TEXT_TAGS if tag not in html.TABLE_TAGS][:2]
-    html_str = (
-        f"<body>\n"
-        f"    <{tag1}>\n"
-        f"        <{tag2}>\n"
-        f"            There is some text here.\n"
-        f"        </{tag2}>\n"
-        f"    </{tag1}>\n"
-        f"</body>\n"
+    html_document = HTMLDocument.from_string(
+        "<body>\n"
+        "  <p>\n"
+        "    <a>\n"
+        "      There is some text here.\n"
+        "    </a>\n"
+        "  </p>\n"
+        "</body>\n"
     )
 
-    html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)
-
     assert len(html_document.pages[0].elements) == 1
 
 
@@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str):
     assert len(html_document.pages) == 0
 
 
-# -- has_table_ancestor() ------------------------------------------------------------------------
-
-
-def test_has_table_ancestor():
-    title = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"])
-    assert html.has_table_ancestor(title)
-
-
-def test_has_no_table_ancestor():
-    title = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"])
-    assert not html.has_table_ancestor(title)
-
-
 # -- _bulleted_text_from_table() -----------------------------------------------------------------
 
 
@@ -856,8 +772,8 @@ def test_parse_nothing():
     assert parsed_el is None
 
 
-def test_parse_not_anything(is_narrative_tag_: Mock, is_possible_title_: Mock):
-    is_narrative_tag_.return_value = False
+def test_parse_not_anything(_is_narrative_tag_: Mock, is_possible_title_: Mock):  # noqa: PT019
+    _is_narrative_tag_.return_value = False
     is_possible_title_.return_value = False
     doc = """<p>This is nothing</p>"""
     document_tree = etree.fromstring(doc, etree.HTMLParser())
@@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text():
     document_tree = etree.fromstring(doc, etree.HTMLParser())
     el = document_tree.find(".//div")
     assert el is not None
-    assert html.is_list_item_tag(el) is True
+    assert html._is_list_item_tag(el) is True
     parsed_el, _ = html._process_list_item(el)
     assert parsed_el is None
 
@@ -1071,8 +987,8 @@ def it_reduces_a_nested_table_to_its_text_placed_in_the_cell_containing_the_nest
 
 
 @pytest.fixture
-def is_narrative_tag_(request: FixtureRequest):
-    return function_mock(request, "unstructured.documents.html.is_narrative_tag")
+def _is_narrative_tag_(request: FixtureRequest):
+    return function_mock(request, "unstructured.documents.html._is_narrative_tag")
 
 
 @pytest.fixture
 
@@ -28,7 +28,7 @@
     TableChunk,
     Title,
 )
-from unstructured.documents.html import HTMLTable, TagsMixin
+from unstructured.documents.html_elements import HTMLTable, TagsMixin
 from unstructured.partition.html import partition_html
 
 # -- document-source (filename, file, text, url) -------------------------------------------------
 
@@ -1 +1 @@
-__version__ = "0.14.5-dev5"  # pragma: no cover
+__version__ = "0.14.5-dev6"  # pragma: no cover
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-## 0.14.5-dev5`
	`1`	`+## 0.14.5-dev6`
`2`	`2`
`3`	`3`	`### Enhancements`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@`
`28`	`28`	`TableChunk,`
`29`	`29`	`Title,`
`30`	`30`	`)`
`31`		`-from unstructured.documents.html import HTMLTable, TagsMixin`
	`31`	`+from unstructured.documents.html_elements import HTMLTable, TagsMixin`
`32`	`32`	`from unstructured.partition.html import partition_html`
`33`	`33`
`34`	`34`	`# -- document-source (filename, file, text, url) -------------------------------------------------`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.14.5-dev5" # pragma: no cover`
	`1`	`+__version__ = "0.14.5-dev6" # pragma: no cover`