feat: add partition_xml for XML files (#596)

MthwRobinson · qued · web-flow · commit 23ff32cc4221 · 2023-05-18T15:40:12.000Z
* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesnt need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@
 
 ### Features
 
+* Add `partition_xml` for XML files.
 * Add `partition_xlsx` for Microsoft Excel documents.
 
 ### Fixes
diff --git a/README.md b/README.md
@@ -182,7 +182,8 @@ you can also uninstall the hooks with `pre-commit uninstall`.
 You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
 
 The following examples show how to get started with the `unstructured` library.
-You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
+
+You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
 **XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
 and **PNG** documents with one line of code!
 <br></br>
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@@ -371,6 +371,26 @@ to disable SSL verification in the request.
   elements = partition_html(url="https://python.org/", ssl_verify=False)
 
 
+``partition_xml``
+-----------------
+
+The ``partition_xml`` function processes XML documents.
+If ``xml_keep_tags=False``, the function returns only the text content of the tags.
+You can use ``xml_path`` in conjunction with ``xml_keep_tags=False`` to restrict the text
+extraction to specific tags.
+If ``xml_keep_tags=True``, the function returns tag information in addition to tag text.
+``xml_keep_tags`` is ``False`` by default.
+
+
+.. code:: python
+
+  from unstructured.partition.xml import partition_xml
+
+  elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=True)
+
+  elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False)
+
+
 
 ``partition_pdf``
 ---------------------
diff --git a/example-docs/factbook.xml b/example-docs/factbook.xml
diff --git a/test_unstructured/documents/test_xml.py b/test_unstructured/documents/test_xml.py
@@ -45,7 +45,7 @@ def test_from_string(sample_document):
 
 
 def test_read_with_stylesheet():
-    filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
+    filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
     stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
 
     xml_document = XMLDocument.from_file(filename=filename, stylesheet=stylesheet)
@@ -57,7 +57,7 @@ def test_read_with_stylesheet():
 
 
 def test_read_with_stylesheet_warns_with_html_parser(caplog):
-    filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
+    filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
     stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
 
     XMLDocument.from_file(filename=filename, stylesheet=stylesheet, parser=etree.HTMLParser())
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -32,7 +32,7 @@
         ("example.jpg", FileType.JPG),
         ("fake-text.txt", FileType.TXT),
         ("fake-email.eml", FileType.EML),
-        ("unsupported/factbook.xml", FileType.XML),
+        ("factbook.xml", FileType.XML),
         ("example-10k.html", FileType.HTML),
         ("fake-html.html", FileType.HTML),
         ("stanley-cups.xlsx", FileType.XLSX),
@@ -55,7 +55,7 @@ def test_detect_filetype_from_filename(file, expected):
         ("example.jpg", FileType.JPG),
         ("fake-text.txt", FileType.TXT),
         ("fake-email.eml", FileType.EML),
-        ("unsupported/factbook.xml", FileType.XML),
+        ("factbook.xml", FileType.XML),
         ("example-10k.html", FileType.HTML),
         ("fake-html.html", FileType.HTML),
         ("stanley-cups.xlsx", FileType.XLSX),
@@ -88,7 +88,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
         ("example.jpg", FileType.JPG),
         ("fake-text.txt", FileType.TXT),
         ("fake-email.eml", FileType.EML),
-        ("unsupported/factbook.xml", FileType.XML),
+        ("factbook.xml", FileType.XML),
         # NOTE(robinson) - For the document, some operating systems return
         # */xml and some return */html. Either could be acceptable depending on the OS
         ("example-10k.html", [FileType.HTML, FileType.XML]),
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -574,9 +574,7 @@ def test_auto_filetype_overrides_file_specific(content_type, expected):
     not in (
         FileType.UNK,
         FileType.ZIP,
-        FileType.XML,
         FileType.XLS,
-        FileType.XLSX,
     )
 ]
 
@@ -613,6 +611,34 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
             break
 
 
+def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
+    elements = partition(filename=filename, xml_keep_tags=False)
+
+    assert elements[0].text == "United States"
+    assert elements[0].metadata.filename == "factbook.xml"
+
+
+def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
+    with open(filename, "rb") as f:
+        elements = partition(file=f, xml_keep_tags=False)
+
+    assert elements[0].text == "United States"
+
+
+def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
+    elements = partition(filename=filename, xml_keep_tags=True)
+
+    assert elements[5].text == "<name>United States</name>"
+    assert elements[5].metadata.filename == "factbook.xml"
+
+
+def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
+    with open(filename, "rb") as f:
+        elements = partition(file=f, xml_keep_tags=True)
+
+    assert elements[5].text == "<name>United States</name>"
+
+
 EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
   <tbody>
     <tr>
diff --git a/test_unstructured/partition/test_xml_partition.py b/test_unstructured/partition/test_xml_partition.py
@@ -0,0 +1,31 @@
+from unstructured.partition.xml import partition_xml
+
+
+def test_partition_xml_from_filename(filename="example-docs/factbook.xml"):
+    elements = partition_xml(filename=filename, xml_keep_tags=False)
+
+    assert elements[0].text == "United States"
+    assert elements[0].metadata.filename == "factbook.xml"
+
+
+def test_partition_xml_from_file(filename="example-docs/factbook.xml"):
+    with open(filename, "rb") as f:
+        elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=filename)
+
+    assert elements[0].text == "United States"
+    assert elements[0].metadata.filename == "factbook.xml"
+
+
+def test_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
+    elements = partition_xml(filename=filename, xml_keep_tags=True)
+
+    assert elements[5].text == "<name>United States</name>"
+    assert elements[5].metadata.filename == "factbook.xml"
+
+
+def test_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
+    with open(filename, "rb") as f:
+        elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=filename)
+
+    assert elements[5].text == "<name>United States</name>"
+    assert elements[5].metadata.filename == "factbook.xml"
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -27,6 +27,7 @@
 from unstructured.partition.rtf import partition_rtf
 from unstructured.partition.text import partition_text
 from unstructured.partition.xlsx import partition_xlsx
+from unstructured.partition.xml import partition_xml
 
 
 def partition(
@@ -43,6 +44,7 @@ def partition(
     ssl_verify: bool = True,
     ocr_languages: str = "eng",
     pdf_infer_table_structure: bool = False,
+    xml_keep_tags: bool = False,
 ):
     """Partitions a document into its constituent elements. Will use libmagic to determine
     the file's type and route it to the appropriate partitioning function. Applies the default
@@ -83,6 +85,9 @@ def partition(
         additional metadata field, "text_as_html," where the value (string) is a just a
         transformation of the data into an HTML <table>.
         The "text" field for a partitioned Table Element is always present, whether True or False.
+    xml_keep_tags
+        If True, will retain the XML tags in the output. Otherwise it will simply extract
+        the text from within the tags. Only applies to partition_xml.
     """
     exactly_one(file=file, filename=filename, url=url)
 
@@ -126,6 +131,13 @@ def partition(
             include_page_breaks=include_page_breaks,
             encoding=encoding,
         )
+    elif filetype == FileType.XML:
+        elements = partition_xml(
+            filename=filename,
+            file=file,
+            encoding=encoding,
+            xml_keep_tags=xml_keep_tags,
+        )
     elif filetype == FileType.EPUB:
         elements = partition_epub(
             filename=filename,
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
@@ -33,6 +33,8 @@ def partition_text(
     text: Optional[str] = None,
     encoding: Optional[str] = "utf-8",
     paragraph_grouper: Optional[Callable[[str], str]] = None,
+    metadata_filename: Optional[str] = None,
+    include_metadata: bool = True,
 ) -> List[Element]:
     """Partitions an .txt documents into its constituent elements.
     Parameters
@@ -48,6 +50,8 @@ def partition_text(
     paragrapher_grouper
         A str -> str function for fixing paragraphs that are interrupted by line breaks
         for formatting purposes.
+    include_metadata
+        Determines whether or not metadata is included in the output.
     """
     if text is not None and text.strip() == "" and not file and not filename:
         return []
@@ -77,8 +81,12 @@ def partition_text(
 
     file_content = split_by_paragraph(file_text)
 
+    metadata_filename = metadata_filename or filename
+
     elements: List[Element] = []
-    metadata = ElementMetadata(filename=filename)
+    metadata = (
+        ElementMetadata(filename=metadata_filename) if include_metadata else ElementMetadata()
+    )
     for ctext in file_content:
         ctext = ctext.strip()
 
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
@@ -0,0 +1,89 @@
+import xml.etree.ElementTree as ET
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, Optional, Union, cast
+
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
+from unstructured.partition.text import partition_text
+
+
+def is_leaf(elem):
+    return not bool(elem)
+
+
+def get_leaf_elements(
+    filename: Optional[str] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
+    xml_path: str = ".",
+):
+    if filename:
+        tree = ET.parse(filename)
+    elif file:
+        f = spooled_to_bytes_io_if_needed(
+            cast(Union[BinaryIO, SpooledTemporaryFile], file),
+        )
+        tree = ET.parse(f)  # type: ignore
+
+    root = tree.getroot()
+    leaf_elements = []
+
+    for elem in root.findall(xml_path):
+        for subelem in elem.iter():
+            if is_leaf(subelem):
+                leaf_elements.append(subelem.text)
+
+    return "\n".join(leaf_elements)  # type: ignore
+
+
+@add_metadata_with_filetype(FileType.XML)
+def partition_xml(
+    filename: Optional[str] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
+    xml_keep_tags: bool = False,
+    xml_path: str = ".",
+    metadata_filename: Optional[str] = None,
+    include_metadata: bool = True,
+    encoding: str = "utf-8",
+):
+    """Partitions an XML document into its document elements.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    xml_keep_tags
+        If True, will retain the XML tags in the output. Otherwise it will simply extract
+        the text from within the tags.
+    xml_path
+        The xml_path to use for extracting the text. Only used if xml_keep_tags=False.
+    metadata_filename
+        The filename to use for the metadata.
+    encoding
+        Encoding used to decode the file object when xml_keep_tags=True. Defaults to utf-8.
+    include_metadata
+        Determines whether or not metadata is included in the metadata attribute on the
+        elements in the output.
+    """
+    exactly_one(filename=filename, file=file)
+    metadata_filename = metadata_filename or filename
+
+    if xml_keep_tags:
+        if filename:
+            with open(filename) as f:
+                raw_text = f.read()
+        elif file:
+            f = spooled_to_bytes_io_if_needed(  # type: ignore
+                cast(Union[BinaryIO, SpooledTemporaryFile], file),
+            )
+            raw_text = f.read().decode(encoding)  # type: ignore
+    else:
+        raw_text = get_leaf_elements(filename=filename, file=file, xml_path=xml_path)
+    elements = partition_text(
+        text=raw_text,
+        metadata_filename=metadata_filename,
+        include_metadata=include_metadata,
+    )
+
+    return elements