fix: enable partition_html to grab content outside of <article> tags (#772)

MthwRobinson · web-flow · commit c53ce117bca0 · 2023-06-20T17:07:30.000Z
* optionally dont assemble articles

* add test for content outside of articles

* pass kwargs in partition

* changelog and version

* update default to False

* bump version for release

* back to dev version to get another fix in the release
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,9 @@
 
 ### Fixes
 
+* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to
+  control whether content outside of `<article>` tags is captured when
+  `<article>` tags are present.
 * Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.
 
 ## 0.7.6
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -329,6 +329,15 @@ to disable SSL verification in the request.
   elements = partition_html(url="https://python.org/", ssl_verify=False)
 
 
+
+If your website contains news articles, it can be helpful to only grab content that appears in
+between the ``<article>`` tags, if the site uses that convention.
+To activate this behavior, you can set ``html_assemble_articles=True``.
+If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page.
+If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior
+is the same as ``html_assemble_articles=False``.
+
+
 ``partition_image``
 ---------------------
 
diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py
@@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code():
     html_text = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>"""
     elements = partition_html(text=html_text)
     assert elements[0] == Title("Hello 😀")
+
+
+def test_partition_html_can_turn_off_assemble_articles():
+    html_text = """<html>
+    <article>
+        <h1>Some important stuff is going on!</h1>
+        <p>Here is a description of that stuff</p>
+    </article>
+    <article>
+        <h1>Some other important stuff is going on!</h1>
+        <p>Here is a description of that stuff</p>
+    </article>
+    <h4>This is outside of the article.</h4>
+</html>
+"""
+    elements = partition_html(text=html_text, html_assemble_articles=False)
+    assert elements[-1] == Title("This is outside of the article.")
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
@@ -20,7 +20,7 @@
     Text,
     Title,
 )
-from unstructured.documents.xml import XMLDocument
+from unstructured.documents.xml import VALID_PARSERS, XMLDocument
 from unstructured.logger import logger
 from unstructured.partition.text_type import (
     is_bulleted_text,
@@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument):
     """Class for handling HTML documents. Uses rules based parsing to identify sections
     of interest within the document."""
 
+    def __init__(
+        self,
+        stylesheet: Optional[str] = None,
+        parser: VALID_PARSERS = None,
+        assemble_articles: bool = True,
+    ):
+        self.assembled_articles = assemble_articles
+        super().__init__(stylesheet=stylesheet, parser=parser)
+
     def _read(self) -> List[Page]:
         """Reads and structures and HTML document. If present, looks for article tags.
         if there are multiple article sections present, a page break is inserted between them.
@@ -101,7 +110,7 @@ def _read(self) -> List[Page]:
         etree.strip_elements(self.document_tree, ["script"])
         root = _find_main(self.document_tree)
 
-        articles = _find_articles(root)
+        articles = _find_articles(root, assemble_articles=self.assembled_articles)
         page_number = 0
         page = Page(number=page_number)
         for article in articles:
@@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element:
     return main_tag_elem if main_tag_elem is not None else root
 
 
-def _find_articles(root: etree.Element) -> List[etree.Element]:
+def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]:
     """Tries to break the HTML document into distinct articles. If there are no article
     tags, the entire document is returned as a single item list."""
+    if assemble_articles is False:
+        return root
+
     articles = root.findall(".//article")
     if len(articles) == 0:
         # NOTE(robinson) - ref: https://schema.org/Article
diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py
@@ -92,10 +92,16 @@ def _read_xml(self, content):
         return self.document_tree
 
     @classmethod
-    def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
+    def from_string(
+        cls,
+        text: str,
+        parser: VALID_PARSERS = None,
+        stylesheet: Optional[str] = None,
+        **kwargs,
+    ):
         """Supports reading in an XML file as a raw string rather than as a file."""
         logger.info("Reading document from string ...")
-        doc = cls(parser=parser, stylesheet=stylesheet)
+        doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
         doc._read_xml(text)
         return doc
 
@@ -106,6 +112,7 @@ def from_file(
         parser: VALID_PARSERS = None,
         stylesheet: Optional[str] = None,
         encoding: Optional[str] = None,
+        **kwargs,
     ):
         _, content = read_txt_file(filename=filename, encoding=encoding)
-        return cls.from_string(content, parser=parser, stylesheet=stylesheet)
+        return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -50,6 +50,7 @@ def partition(
     pdf_infer_table_structure: bool = False,
     xml_keep_tags: bool = False,
     data_source_metadata: Optional[DataSourceMetadata] = None,
+    **kwargs,
 ):
     """Partitions a document into its constituent elements. Will use libmagic to determine
     the file's type and route it to the appropriate partitioning function. Applies the default
@@ -121,46 +122,51 @@ def partition(
         file.seek(0)
 
     if filetype == FileType.DOC:
-        elements = partition_doc(filename=filename, file=file)
+        elements = partition_doc(filename=filename, file=file, **kwargs)
     elif filetype == FileType.DOCX:
-        elements = partition_docx(filename=filename, file=file)
+        elements = partition_docx(filename=filename, file=file, **kwargs)
     elif filetype == FileType.ODT:
-        elements = partition_odt(filename=filename, file=file)
+        elements = partition_odt(filename=filename, file=file, **kwargs)
     elif filetype == FileType.EML:
-        elements = partition_email(filename=filename, file=file, encoding=encoding)
+        elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
     elif filetype == FileType.MSG:
-        elements = partition_msg(filename=filename, file=file)
+        elements = partition_msg(filename=filename, file=file, **kwargs)
     elif filetype == FileType.HTML:
         elements = partition_html(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             encoding=encoding,
+            **kwargs,
         )
     elif filetype == FileType.XML:
         elements = partition_xml(
             filename=filename,
             file=file,
             encoding=encoding,
             xml_keep_tags=xml_keep_tags,
+            **kwargs,
         )
     elif filetype == FileType.EPUB:
         elements = partition_epub(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
+            **kwargs,
         )
     elif filetype == FileType.RST:
         elements = partition_rst(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
+            **kwargs,
         )
     elif filetype == FileType.MD:
         elements = partition_md(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
+            **kwargs,
         )
     elif filetype == FileType.PDF:
         elements = partition_pdf(
@@ -171,6 +177,7 @@ def partition(
             infer_table_structure=pdf_infer_table_structure,
             strategy=strategy,
             ocr_languages=ocr_languages,
+            **kwargs,
         )
     elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
         elements = partition_image(
@@ -180,40 +187,45 @@ def partition(
             include_page_breaks=include_page_breaks,
             strategy=strategy,
             ocr_languages=ocr_languages,
+            **kwargs,
         )
     elif filetype == FileType.TXT:
         elements = partition_text(
             filename=filename,
             file=file,
             encoding=encoding,
             paragraph_grouper=paragraph_grouper,
+            **kwargs,
         )
     elif filetype == FileType.RTF:
         elements = partition_rtf(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
+            **kwargs,
         )
     elif filetype == FileType.PPT:
         elements = partition_ppt(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
+            **kwargs,
         )
     elif filetype == FileType.PPTX:
         elements = partition_pptx(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
+            **kwargs,
         )
     elif filetype == FileType.JSON:
-        elements = partition_json(filename=filename, file=file)
+        elements = partition_json(filename=filename, file=file, **kwargs)
     elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
-        elements = partition_xlsx(filename=filename, file=file)
+        elements = partition_xlsx(filename=filename, file=file, **kwargs)
     elif filetype == FileType.CSV:
-        elements = partition_csv(filename=filename, file=file)
+        elements = partition_csv(filename=filename, file=file, **kwargs)
     elif filetype == FileType.TSV:
-        elements = partition_tsv(filename=filename, file=file)
+        elements = partition_tsv(filename=filename, file=file, **kwargs)
     elif filetype == FileType.EMPTY:
         elements = []
     else:
diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py
@@ -30,6 +30,7 @@ def partition_html(
     headers: Dict[str, str] = {},
     ssl_verify: bool = True,
     parser: VALID_PARSERS = None,
+    html_assemble_articles: bool = False,
     **kwargs,
 ) -> List[Element]:
     """Partitions an HTML document into its constituent elements.
@@ -66,15 +67,28 @@ def partition_html(
     exactly_one(filename=filename, file=file, text=text, url=url)
 
     if filename is not None:
-        document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
+        document = HTMLDocument.from_file(
+            filename,
+            parser=parser,
+            encoding=encoding,
+            assemble_articles=html_assemble_articles,
+        )
 
     elif file is not None:
         _, file_text = read_txt_file(file=file, encoding=encoding)
-        document = HTMLDocument.from_string(file_text, parser=parser)
+        document = HTMLDocument.from_string(
+            file_text,
+            parser=parser,
+            assemble_articles=html_assemble_articles,
+        )
 
     elif text is not None:
         _text: str = str(text)
-        document = HTMLDocument.from_string(_text, parser=parser)
+        document = HTMLDocument.from_string(
+            _text,
+            parser=parser,
+            assemble_articles=html_assemble_articles,
+        )
 
     elif url is not None:
         response = requests.get(url, headers=headers, verify=ssl_verify)