refactor: update detect_filetype() to use hashmap for mime type return (#591)

marcusyatim · MthwRobinson · web-flow · commit 7eac1f8ca77a · 2023-05-17T13:48:52.000Z
* Update detect_filetype() to use hashmap for mime type return

* fix: text mime type and linting

* fix: declare docx and xlsx mime types locally and also fix linting

* Update CHANGELOG.md

* tweaks for failing tests

---------

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@
   partition strategy in CLI. For example, `--partition-strategy fast`.
 * Added metadata for filetype.
 * Add Discord connector to pull messages from a list of channels
+* Refactor `unstructured/file_utils/filetype.py` to make better use of a hashmap when returning the MIME type.
+* Add local declaration of DOCX_MIME_TYPES and XLSX_MIME_TYPES for `test_filetype.py`.
 
 ### Features
 
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -7,8 +7,6 @@
 
 from unstructured.file_utils import filetype
 from unstructured.file_utils.filetype import (
-    DOCX_MIME_TYPES,
-    XLSX_MIME_TYPES,
     FileType,
     _is_text_file_a_json,
     detect_filetype,
@@ -17,6 +15,14 @@
 FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
 EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs")
 
+DOCX_MIME_TYPES = [
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+]
+
+XLSX_MIME_TYPES = [
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+]
+
 
 @pytest.mark.parametrize(
     ("file", "expected"),
@@ -142,15 +148,23 @@ def test_detect_html_text_xml(monkeypatch):
 
 
 def test_detect_docx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
     assert filetype == FileType.DOCX
 
 
 def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
     filetype = detect_filetype(filename=filename)
     assert filetype == FileType.DOCX
@@ -173,55 +187,87 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
 
 
 def test_detect_doc_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/msword",
+    )
     filetype = detect_filetype(filename="fake.doc")
     assert filetype == FileType.DOC
 
 
 def test_detect_ppt_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/vnd.ms-powerpoint",
+    )
     filetype = detect_filetype(filename="fake.ppt")
     assert filetype == FileType.PPT
 
 
 def test_detect_xls_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/vnd.ms-excel",
+    )
     filetype = detect_filetype(filename="fake.xls")
     assert filetype == FileType.XLS
 
 
 def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
     assert filetype == FileType.XLSX
 
 
 def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
     filetype = detect_filetype(filename=filename)
     assert filetype == FileType.XLSX
 
 
 def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
     assert filetype == FileType.PPTX
 
 
 def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
     filetype = detect_filetype(filename=filename)
     assert filetype == FileType.PPTX
 
 
 def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -20,54 +20,11 @@
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_HEAD_RE
 
-DOCX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-]
-
-DOC_MIME_TYPES = [
-    "application/msword",
-]
-
-ODT_MIME_TYPES = [
-    "application/vnd.oasis.opendocument.text",
-]
-
-XLSX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-]
-
-XLS_MIME_TYPES = [
-    "application/vnd.ms-excel",
-]
-
-PPTX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-]
-
-PPT_MIME_TYPES = [
-    "application/vnd.ms-powerpoint",
-]
-
-MSG_MIME_TYPES = [
-    "application/vnd.ms-outlook",
-    "application/x-ole-storage",
-]
-
 TXT_MIME_TYPES = [
     "text/plain",
     "message/rfc822",  # ref: https://www.rfc-editor.org/rfc/rfc822
 ]
 
-MD_MIME_TYPES = [
-    "text/markdown",
-    "text/x-markdown",
-]
-
-EPUB_MIME_TYPES = [
-    "application/epub",
-    "application/epub+zip",
-]
-
 # NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
 # If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
 # looking for expected filenames within the zip file.
@@ -141,6 +98,7 @@ def __lt__(self, other):
     "application/epub+zip": FileType.EPUB,
     "application/json": FileType.JSON,
     "application/rtf": FileType.RTF,
+    "text/rtf": FileType.RTF,
     "text/html": FileType.HTML,
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
     "application/vnd.ms-excel": FileType.XLS,
@@ -149,6 +107,7 @@ def __lt__(self, other):
     "application/xml": FileType.XML,
     "application/vnd.oasis.opendocument.text": FileType.ODT,
     "message/rfc822": FileType.EML,
+    "application/x-ole-storage": FileType.MSG,
     "application/vnd.ms-outlook": FileType.MSG,
 }
 
@@ -206,13 +165,9 @@ def detect_filetype(
         extension = extension.lower()
         if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
             mime_type = magic.from_file(filename or file_filename, mime=True)  # type: ignore
-            # NOTE(crag): for older versions of the OS libmagic package, such as is currently
-            # installed on the Unstructured docker image, .json files resolve to "text/plain"
-            # rather than "application/json". this corrects for that case.
-            if mime_type == "text/plain" and extension == ".json":
-                return FileType.JSON
         else:
             return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
+
     elif file is not None:
         extension = None
         # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
@@ -229,77 +184,41 @@ def detect_filetype(
     else:
         raise ValueError("No filename, file, nor file_filename were specified.")
 
-    if mime_type == "application/pdf":
-        return FileType.PDF
+    """Mime type special cases."""
 
-    elif mime_type == "application/json":
+    # NOTE(crag): for older versions of the OS libmagic package, such as is currently
+    # installed on the Unstructured docker image, .json files resolve to "text/plain"
+    # rather than "application/json". this corrects for that case.
+    if mime_type == "text/plain" and extension == ".json":
         return FileType.JSON
 
-    elif mime_type in DOCX_MIME_TYPES:
-        return FileType.DOCX
-
-    elif mime_type in DOC_MIME_TYPES:
-        return FileType.DOC
-
-    elif mime_type in ODT_MIME_TYPES:
-        return FileType.ODT
-
-    elif mime_type in MSG_MIME_TYPES:
-        return FileType.MSG
-
-    elif mime_type == "image/jpeg":
-        return FileType.JPG
-
-    elif mime_type == "image/png":
-        return FileType.PNG
-
-    elif mime_type in MD_MIME_TYPES:
-        # NOTE - I am not sure whether libmagic ever returns these mimetypes.
-        return FileType.MD
-
-    elif mime_type in EPUB_MIME_TYPES:
-        return FileType.EPUB
-
-    # NOTE(robinson) - examples are application/rtf or text/rtf.
-    # magic often returns text/plain for RTF files
-    elif mime_type.endswith("rtf"):
-        return FileType.RTF
-
     elif mime_type.endswith("xml"):
         if extension and (extension == ".html" or extension == ".htm"):
             return FileType.HTML
         else:
             return FileType.XML
 
-    elif mime_type == "text/html":
-        return FileType.HTML
-
     elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
         if extension and extension == ".eml":
             return FileType.EML
         elif extension and extension == ".md":
             return FileType.MD
         elif extension and extension == ".rtf":
             return FileType.RTF
+        elif extension and extension == ".html":
+            return FileType.HTML
 
         if _is_text_file_a_json(file=file, filename=filename):
             return FileType.JSON
 
         if file and not extension and _check_eml_from_buffer(file=file) is True:
             return FileType.EML
-        return FileType.TXT
 
-    elif mime_type in XLSX_MIME_TYPES:
-        return FileType.XLSX
+        # Safety catch
+        if mime_type in STR_TO_FILETYPE:
+            return STR_TO_FILETYPE[mime_type]
 
-    elif mime_type in XLS_MIME_TYPES:
-        return FileType.XLS
-
-    elif mime_type in PPTX_MIME_TYPES:
-        return FileType.PPTX
-
-    elif mime_type in PPT_MIME_TYPES:
-        return FileType.PPT
+        return FileType.TXT
 
     elif mime_type == "application/octet-stream":
         if file and not extension:
@@ -321,6 +240,10 @@ def detect_filetype(
         else:
             return EXT_TO_FILETYPE.get(extension.lower(), filetype)
 
+    # For everything else
+    elif mime_type in STR_TO_FILETYPE:
+        return STR_TO_FILETYPE[mime_type]
+
     logger.warning(
         f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
         "This file type is not currently supported in unstructured.",