feat: add partition_xlsx for MSFT Excel files (#594)

MthwRobinson · web-flow · commit b8037118c4db · 2023-05-16T19:40:40.000Z
* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,8 @@
 
 ### Features
 
+* Add `partition_xlsx` for Microsoft Excel documents.
+
 ### Fixes
 
 * Supports `hml` filetype for partition as a variation of html filetype.
diff --git a/README.md b/README.md
@@ -183,7 +183,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj
 
 The following examples show how to get started with the `unstructured` library.
 You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
-**ODT**, **PPT**, **PPTX**, **JPG**,
+**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
 and **PNG** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
@@ -198,7 +198,7 @@ If you are using the `partition` brick, you may need to install additional param
 instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
 `partition` will always apply the default arguments. If you need
 advanced features, use a document-specific brick. The `partition` brick currently works for
-`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
+`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.xlsx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
 
 ```python
 from unstructured.partition.auto import partition
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@@ -251,6 +251,24 @@ Examples:
   elements = partition_doc(filename="example-docs/fake.doc")
 
 
+``partition_xlsx``
+------------------
+
+The ``partition_xlsx`` function pre-processes Microsoft Excel documents. Each
+sheet in the Excel file will be stored as a ``Table`` object. The plain text
+of the sheet will be the ``text`` attribute of the ``Table``. The ``text_as_html``
+attribute in the element metadata will contain an HTML representation of the table.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.xlsx import partition_xlsx
+
+  elements = partition_xlsx(filename="example-docs/stanley-cups.xlsx")
+  print(elements[0].metadata.text_as_html)
+
+
 ``partition_odt``
 ------------------
 
diff --git a/example-docs/stanley-cups.xlsx b/example-docs/stanley-cups.xlsx
diff --git a/example-docs/unsupported/fake-excel.xlsx b/example-docs/unsupported/fake-excel.xlsx
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -29,7 +29,7 @@
         ("unsupported/factbook.xml", FileType.XML),
         ("example-10k.html", FileType.HTML),
         ("fake-html.html", FileType.HTML),
-        ("unsupported/fake-excel.xlsx", FileType.XLSX),
+        ("stanley-cups.xlsx", FileType.XLSX),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
         ("spring-weather.html.json", FileType.JSON),
@@ -52,7 +52,7 @@ def test_detect_filetype_from_filename(file, expected):
         ("unsupported/factbook.xml", FileType.XML),
         ("example-10k.html", FileType.HTML),
         ("fake-html.html", FileType.HTML),
-        ("unsupported/fake-excel.xlsx", FileType.XLSX),
+        ("stanley-cups.xlsx", FileType.XLSX),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
         ("fake-doc.rtf", FileType.RTF),
@@ -87,7 +87,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
         # */xml and some return */html. Either could be acceptable depending on the OS
         ("example-10k.html", [FileType.HTML, FileType.XML]),
         ("fake-html.html", FileType.HTML),
-        ("unsupported/fake-excel.xlsx", FileType.XLSX),
+        ("stanley-cups.xlsx", FileType.XLSX),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
     ],
@@ -192,15 +192,15 @@ def test_detect_xls_file_from_mime_type(monkeypatch):
 
 def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
     monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
     assert filetype == FileType.XLSX
 
 
 def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
     monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
     filetype = detect_filetype(filename=filename)
     assert filetype == FileType.XLSX
 
@@ -246,7 +246,7 @@ def test_detect_docx_filetype_word_mime_type(monkeypatch):
 
 def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
     monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
     assert filetype == FileType.XLSX
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -9,12 +9,14 @@
 import pypandoc
 import pytest
 
+from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import (
     Address,
     ElementMetadata,
     ListItem,
     NarrativeText,
     PageBreak,
+    Table,
     Text,
     Title,
 )
@@ -609,3 +611,59 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
             elements = fun(str(file))
             assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements)
             break
+
+
+EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+  </tbody>
+</table>"""
+
+
+EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+
+EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+
+
+def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
+    elements = partition(filename=filename)
+
+    assert all(isinstance(element, Table) for element in elements)
+    assert len(elements) == 2
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.page_number == 1
+    assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
+
+
+def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
+    with open(filename, "rb") as f:
+        elements = partition(file=f)
+
+    assert all(isinstance(element, Table) for element in elements)
+    assert len(elements) == 2
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.page_number == 1
+    assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py
@@ -0,0 +1,70 @@
+from unstructured.cleaners.core import clean_extra_whitespace
+from unstructured.documents.elements import Table
+from unstructured.partition.xlsx import partition_xlsx
+
+EXPECTED_TABLE = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+  </tbody>
+</table>"""
+
+
+EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+
+EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+
+
+def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
+    elements = partition_xlsx(filename=filename)
+
+    assert all(isinstance(element, Table) for element in elements)
+    assert len(elements) == 2
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert elements[0].metadata.page_number == 1
+    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
+    with open(filename, "rb") as f:
+        elements = partition_xlsx(file=f)
+
+    assert all(isinstance(element, Table) for element in elements)
+    assert len(elements) == 2
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert elements[0].metadata.page_number == 1
+    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_xlsx_can_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
+    elements = partition_xlsx(filename=filename, include_metadata=False)
+
+    assert all(isinstance(element, Table) for element in elements)
+    assert len(elements) == 2
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert elements[0].metadata.text_as_html is None
+    assert elements[0].metadata.page_number is None
+    assert elements[0].metadata.filetype is None
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -77,7 +77,6 @@
 ]
 
 EXPECTED_XLSX_FILES = [
-    "docProps/core.xml",
     "xl/workbook.xml",
 ]
 
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -26,6 +26,7 @@
 from unstructured.partition.pptx import partition_pptx
 from unstructured.partition.rtf import partition_rtf
 from unstructured.partition.text import partition_text
+from unstructured.partition.xlsx import partition_xlsx
 
 
 def partition(
@@ -183,6 +184,8 @@ def partition(
         )
     elif filetype == FileType.JSON:
         elements = partition_json(filename=filename, file=file)
+    elif filetype == FileType.XLSX:
+        elements = partition_xlsx(filename=filename, file=file)
     else:
         msg = "Invalid file" if not filename else f"Invalid file {filename}"
         raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
@@ -0,0 +1,63 @@
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast
+
+import lxml.html
+import pandas as pd
+
+from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
+
+
+@add_metadata_with_filetype(FileType.XLSX)
+def partition_xlsx(
+    filename: Optional[str] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
+    metadata_filename: Optional[str] = None,
+    include_metadata: bool = True,
+) -> List[Element]:
+    """Partitions Microsoft Excel Documents in .xlsx format into its document elements.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    metadata_filename
+        The filename to use for the metadata. Relevant because partition_doc converts the
+        document to .xlsx before partition. We want the original source filename in the
+        metadata.
+    include_metadata
+        Determines whether or not metadata is included in the output.
+    """
+    exactly_one(filename=filename, file=file)
+
+    if filename:
+        sheets = pd.read_excel(filename, sheet_name=None)
+    else:
+        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
+        sheets = pd.read_excel(f, sheet_name=None)
+
+    metadata_filename = filename or metadata_filename
+
+    elements: List[Element] = []
+    page_number = 0
+    for sheet_name, table in sheets.items():
+        page_number += 1
+        html_text = table.to_html(index=False, header=False, na_rep="")
+        text = lxml.html.document_fromstring(html_text).text_content()
+
+        if include_metadata:
+            metadata = ElementMetadata(
+                text_as_html=html_text,
+                page_number=page_number,
+                filename=metadata_filename,
+            )
+        else:
+            metadata = ElementMetadata()
+
+        table = Table(text=text, metadata=metadata)
+        elements.append(table)
+
+    return elements

Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,6 @@`
`77`	`77`	`]`
`78`	`78`
`79`	`79`	`EXPECTED_XLSX_FILES = [`
`80`		`- "docProps/core.xml",`
`81`	`80`	`"xl/workbook.xml",`
`82`	`81`	`]`
`83`	`82`