Skip to content

Commit 23ff32c

Browse files
MthwRobinson and qued
authored
feat: add partition_xml for XML files (#596)
* first pass on partition_xml * add option to keep xml tags * added tests for xml * fix filename * update filenames * remove outdated readme * add xml to auto * version and changelog * update readme and docs * pass through include_metadata * update include_metadata description * add README back in * linting, linting, linting * more linting * spooled to bytes doesnt need to be a tuple * Add tests for newly supported filetypes * Correct metadata filetype * doc typo Co-authored-by: qued <[email protected]> * typo fix Co-authored-by: qued <[email protected]> * typo fix Co-authored-by: qued <[email protected]> * keep_xml_tags -> xml_keep_tags --------- Co-authored-by: Alan Bertl <[email protected]> Co-authored-by: qued <[email protected]>
1 parent b6bfbf9 commit 23ff32c

File tree

11 files changed

+198
-10
lines changed

11 files changed

+198
-10
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
### Features
1515

16+
* Add `partition_xml` for XML files.
1617
* Add `partition_xlsx` for Microsoft Excel documents.
1718

1819
### Fixes

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,8 @@ you can also uninstall the hooks with `pre-commit uninstall`.
182182
You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
183183

184184
The following examples show how to get started with the `unstructured` library.
185-
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
185+
186+
You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
186187
**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
187188
and **PNG** documents with one line of code!
188189
<br></br>

docs/source/bricks.rst

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
8383
file type and route it to the appropriate partitioning brick. All partitioning bricks
8484
called within ``partition`` are called using the default kwargs. Use the document-type
8585
specific bricks if you need to apply non-default settings.
86-
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
86+
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
8787
``.png``, ``.jpg``, and ``.txt`` files.
8888
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
8989
``.png``, and ``.jpg``.
@@ -371,6 +371,26 @@ to disable SSL verification in the request.
371371
elements = partition_html(url="https://python.org/", ssl_verify=False)
372372
373373
374+
``partition_xml``
375+
-----------------
376+
377+
The ``partition_xml`` function processes XML documents.
378+
If ``xml_keep_tags=False``, the function only returns the text attributes from the tags.
379+
You can use ``xml_path`` in conjunction with ``xml_keep_tags=False`` to restrict the text
380+
extraction to specific tags.
381+
If ``xml_keep_tags=True``, the function returns tag information in addition to tag text.
382+
``xml_keep_tags`` is ``False`` by default.
383+
384+
385+
.. code:: python
386+
387+
from unstructured.partition.xml import partition_xml
388+
389+
elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=True)
390+
391+
elements = partition_xml(filename="example-docs/factbook.xml", xml_keep_tags=False)
392+
393+
374394
375395
``partition_pdf``
376396
---------------------

test_unstructured/documents/test_xml.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def test_from_string(sample_document):
4545

4646

4747
def test_read_with_stylesheet():
48-
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
48+
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
4949
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
5050

5151
xml_document = XMLDocument.from_file(filename=filename, stylesheet=stylesheet)
@@ -57,7 +57,7 @@ def test_read_with_stylesheet():
5757

5858

5959
def test_read_with_stylesheet_warns_with_html_parser(caplog):
60-
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xml")
60+
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
6161
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
6262

6363
XMLDocument.from_file(filename=filename, stylesheet=stylesheet, parser=etree.HTMLParser())

test_unstructured/file_utils/test_filetype.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
("example.jpg", FileType.JPG),
3333
("fake-text.txt", FileType.TXT),
3434
("fake-email.eml", FileType.EML),
35-
("unsupported/factbook.xml", FileType.XML),
35+
("factbook.xml", FileType.XML),
3636
("example-10k.html", FileType.HTML),
3737
("fake-html.html", FileType.HTML),
3838
("stanley-cups.xlsx", FileType.XLSX),
@@ -55,7 +55,7 @@ def test_detect_filetype_from_filename(file, expected):
5555
("example.jpg", FileType.JPG),
5656
("fake-text.txt", FileType.TXT),
5757
("fake-email.eml", FileType.EML),
58-
("unsupported/factbook.xml", FileType.XML),
58+
("factbook.xml", FileType.XML),
5959
("example-10k.html", FileType.HTML),
6060
("fake-html.html", FileType.HTML),
6161
("stanley-cups.xlsx", FileType.XLSX),
@@ -88,7 +88,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
8888
("example.jpg", FileType.JPG),
8989
("fake-text.txt", FileType.TXT),
9090
("fake-email.eml", FileType.EML),
91-
("unsupported/factbook.xml", FileType.XML),
91+
("factbook.xml", FileType.XML),
9292
# NOTE(robinson) - For the document, some operating systems return
9393
# */xml and some return */html. Either could be acceptable depending on the OS
9494
("example-10k.html", [FileType.HTML, FileType.XML]),

test_unstructured/partition/test_auto.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -574,9 +574,7 @@ def test_auto_filetype_overrides_file_specific(content_type, expected):
574574
not in (
575575
FileType.UNK,
576576
FileType.ZIP,
577-
FileType.XML,
578577
FileType.XLS,
579-
FileType.XLSX,
580578
)
581579
]
582580

@@ -613,6 +611,34 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
613611
break
614612

615613

614+
def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
615+
elements = partition(filename=filename, xml_keep_tags=False)
616+
617+
assert elements[0].text == "United States"
618+
assert elements[0].metadata.filename == "factbook.xml"
619+
620+
621+
def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
622+
with open(filename, "rb") as f:
623+
elements = partition(file=f, xml_keep_tags=False)
624+
625+
assert elements[0].text == "United States"
626+
627+
628+
def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
629+
elements = partition(filename=filename, xml_keep_tags=True)
630+
631+
assert elements[5].text == "<name>United States</name>"
632+
assert elements[5].metadata.filename == "factbook.xml"
633+
634+
635+
def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
636+
with open(filename, "rb") as f:
637+
elements = partition(file=f, xml_keep_tags=True)
638+
639+
assert elements[5].text == "<name>United States</name>"
640+
641+
616642
EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
617643
<tbody>
618644
<tr>
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from unstructured.partition.xml import partition_xml
2+
3+
4+
def test_partition_xml_from_filename(filename="example-docs/factbook.xml"):
5+
elements = partition_xml(filename=filename, xml_keep_tags=False)
6+
7+
assert elements[0].text == "United States"
8+
assert elements[0].metadata.filename == "factbook.xml"
9+
10+
11+
def test_partition_xml_from_file(filename="example-docs/factbook.xml"):
12+
with open(filename, "rb") as f:
13+
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=filename)
14+
15+
assert elements[0].text == "United States"
16+
assert elements[0].metadata.filename == "factbook.xml"
17+
18+
19+
def test_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
20+
elements = partition_xml(filename=filename, xml_keep_tags=True)
21+
22+
assert elements[5].text == "<name>United States</name>"
23+
assert elements[5].metadata.filename == "factbook.xml"
24+
25+
26+
def test_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
27+
with open(filename, "rb") as f:
28+
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=filename)
29+
30+
assert elements[5].text == "<name>United States</name>"
31+
assert elements[5].metadata.filename == "factbook.xml"

unstructured/partition/auto.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from unstructured.partition.rtf import partition_rtf
2828
from unstructured.partition.text import partition_text
2929
from unstructured.partition.xlsx import partition_xlsx
30+
from unstructured.partition.xml import partition_xml
3031

3132

3233
def partition(
@@ -43,6 +44,7 @@ def partition(
4344
ssl_verify: bool = True,
4445
ocr_languages: str = "eng",
4546
pdf_infer_table_structure: bool = False,
47+
xml_keep_tags: bool = False,
4648
):
4749
"""Partitions a document into its constituent elements. Will use libmagic to determine
4850
the file's type and route it to the appropriate partitioning function. Applies the default
@@ -83,6 +85,9 @@ def partition(
8385
additional metadata field, "text_as_html," where the value (string) is just a
8486
transformation of the data into an HTML <table>.
8587
The "text" field for a partitioned Table Element is always present, whether True or False.
88+
xml_keep_tags
89+
If True, will retain the XML tags in the output. Otherwise it will simply extract
90+
the text from within the tags. Only applies to partition_xml.
8691
"""
8792
exactly_one(file=file, filename=filename, url=url)
8893

@@ -126,6 +131,13 @@ def partition(
126131
include_page_breaks=include_page_breaks,
127132
encoding=encoding,
128133
)
134+
elif filetype == FileType.XML:
135+
elements = partition_xml(
136+
filename=filename,
137+
file=file,
138+
encoding=encoding,
139+
xml_keep_tags=xml_keep_tags,
140+
)
129141
elif filetype == FileType.EPUB:
130142
elements = partition_epub(
131143
filename=filename,

unstructured/partition/text.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ def partition_text(
3333
text: Optional[str] = None,
3434
encoding: Optional[str] = "utf-8",
3535
paragraph_grouper: Optional[Callable[[str], str]] = None,
36+
metadata_filename: Optional[str] = None,
37+
include_metadata: bool = True,
3638
) -> List[Element]:
3739
"""Partitions a .txt document into its constituent elements.
3840
Parameters
@@ -48,6 +50,8 @@ def partition_text(
4850
paragraph_grouper
4951
A str -> str function for fixing paragraphs that are interrupted by line breaks
5052
for formatting purposes.
53+
include_metadata
54+
Determines whether or not metadata is included in the output.
5155
"""
5256
if text is not None and text.strip() == "" and not file and not filename:
5357
return []
@@ -77,8 +81,12 @@ def partition_text(
7781

7882
file_content = split_by_paragraph(file_text)
7983

84+
metadata_filename = metadata_filename or filename
85+
8086
elements: List[Element] = []
81-
metadata = ElementMetadata(filename=filename)
87+
metadata = (
88+
ElementMetadata(filename=metadata_filename) if include_metadata else ElementMetadata()
89+
)
8290
for ctext in file_content:
8391
ctext = ctext.strip()
8492

0 commit comments

Comments
 (0)