Skip to content

Commit 7eac1f8

Browse files
refactor: update detect_filetype() to use hashmap for mime type return (#591)
* Update detect_filetype() to use hashmap for mime type return
* fix: text mime type and linting
* fix: declare docx and xlsx mime types locally and also fix linting
* Update CHANGELOG.md
* tweaks for failing tests

---------

Co-authored-by: Matt Robinson <[email protected]>
1 parent f4f40f5 commit 7eac1f8

File tree

3 files changed

+78
-107
lines changed

3 files changed

+78
-107
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
partition strategy in CLI. For example, `--partition-strategy fast`.
88
* Added metadata for filetype.
99
* Add Discord connector to pull messages from a list of channels
10+
* Refactor `unstructured/file_utils/filetype.py` to make better use of a hashmap lookup when returning the file type for a MIME type.
11+
* Add local declaration of DOCX_MIME_TYPES and XLSX_MIME_TYPES for `test_filetype.py`.
1012

1113
### Features
1214

test_unstructured/file_utils/test_filetype.py

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77

88
from unstructured.file_utils import filetype
99
from unstructured.file_utils.filetype import (
10-
DOCX_MIME_TYPES,
11-
XLSX_MIME_TYPES,
1210
FileType,
1311
_is_text_file_a_json,
1412
detect_filetype,
@@ -17,6 +15,14 @@
1715
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
1816
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs")
1917

18+
DOCX_MIME_TYPES = [
19+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
20+
]
21+
22+
XLSX_MIME_TYPES = [
23+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
24+
]
25+
2026

2127
@pytest.mark.parametrize(
2228
("file", "expected"),
@@ -142,15 +148,23 @@ def test_detect_html_text_xml(monkeypatch):
142148

143149

144150
def test_detect_docx_filetype_application_octet_stream(monkeypatch):
145-
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
151+
monkeypatch.setattr(
152+
magic,
153+
"from_buffer",
154+
lambda *args, **kwargs: "application/octet-stream",
155+
)
146156
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
147157
with open(filename, "rb") as f:
148158
filetype = detect_filetype(file=f)
149159
assert filetype == FileType.DOCX
150160

151161

152162
def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch):
153-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
163+
monkeypatch.setattr(
164+
magic,
165+
"from_file",
166+
lambda *args, **kwargs: "application/octet-stream",
167+
)
154168
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
155169
filetype = detect_filetype(filename=filename)
156170
assert filetype == FileType.DOCX
@@ -173,55 +187,87 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
173187

174188

175189
def test_detect_doc_file_from_mime_type(monkeypatch):
176-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
190+
monkeypatch.setattr(
191+
magic,
192+
"from_file",
193+
lambda *args, **kwargs: "application/msword",
194+
)
177195
filetype = detect_filetype(filename="fake.doc")
178196
assert filetype == FileType.DOC
179197

180198

181199
def test_detect_ppt_file_from_mime_type(monkeypatch):
182-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
200+
monkeypatch.setattr(
201+
magic,
202+
"from_file",
203+
lambda *args, **kwargs: "application/vnd.ms-powerpoint",
204+
)
183205
filetype = detect_filetype(filename="fake.ppt")
184206
assert filetype == FileType.PPT
185207

186208

187209
def test_detect_xls_file_from_mime_type(monkeypatch):
188-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
210+
monkeypatch.setattr(
211+
magic,
212+
"from_file",
213+
lambda *args, **kwargs: "application/vnd.ms-excel",
214+
)
189215
filetype = detect_filetype(filename="fake.xls")
190216
assert filetype == FileType.XLS
191217

192218

193219
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
194-
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
220+
monkeypatch.setattr(
221+
magic,
222+
"from_buffer",
223+
lambda *args, **kwargs: "application/octet-stream",
224+
)
195225
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
196226
with open(filename, "rb") as f:
197227
filetype = detect_filetype(file=f)
198228
assert filetype == FileType.XLSX
199229

200230

201231
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
202-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
232+
monkeypatch.setattr(
233+
magic,
234+
"from_file",
235+
lambda *args, **kwargs: "application/octet-stream",
236+
)
203237
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
204238
filetype = detect_filetype(filename=filename)
205239
assert filetype == FileType.XLSX
206240

207241

208242
def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
209-
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
243+
monkeypatch.setattr(
244+
magic,
245+
"from_buffer",
246+
lambda *args, **kwargs: "application/octet-stream",
247+
)
210248
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
211249
with open(filename, "rb") as f:
212250
filetype = detect_filetype(file=f)
213251
assert filetype == FileType.PPTX
214252

215253

216254
def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
217-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
255+
monkeypatch.setattr(
256+
magic,
257+
"from_file",
258+
lambda *args, **kwargs: "application/octet-stream",
259+
)
218260
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
219261
filetype = detect_filetype(filename=filename)
220262
assert filetype == FileType.PPTX
221263

222264

223265
def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
224-
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
266+
monkeypatch.setattr(
267+
magic,
268+
"from_buffer",
269+
lambda *args, **kwargs: "application/octet-stream",
270+
)
225271
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
226272
with open(filename, "rb") as f:
227273
filetype = detect_filetype(file=f)

unstructured/file_utils/filetype.py

Lines changed: 18 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -20,54 +20,11 @@
2020
from unstructured.logger import logger
2121
from unstructured.nlp.patterns import EMAIL_HEAD_RE
2222

23-
DOCX_MIME_TYPES = [
24-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
25-
]
26-
27-
DOC_MIME_TYPES = [
28-
"application/msword",
29-
]
30-
31-
ODT_MIME_TYPES = [
32-
"application/vnd.oasis.opendocument.text",
33-
]
34-
35-
XLSX_MIME_TYPES = [
36-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
37-
]
38-
39-
XLS_MIME_TYPES = [
40-
"application/vnd.ms-excel",
41-
]
42-
43-
PPTX_MIME_TYPES = [
44-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
45-
]
46-
47-
PPT_MIME_TYPES = [
48-
"application/vnd.ms-powerpoint",
49-
]
50-
51-
MSG_MIME_TYPES = [
52-
"application/vnd.ms-outlook",
53-
"application/x-ole-storage",
54-
]
55-
5623
TXT_MIME_TYPES = [
5724
"text/plain",
5825
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
5926
]
6027

61-
MD_MIME_TYPES = [
62-
"text/markdown",
63-
"text/x-markdown",
64-
]
65-
66-
EPUB_MIME_TYPES = [
67-
"application/epub",
68-
"application/epub+zip",
69-
]
70-
7128
# NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension.
7229
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
7330
# looking for expected filenames within the zip file.
@@ -141,6 +98,7 @@ def __lt__(self, other):
14198
"application/epub+zip": FileType.EPUB,
14299
"application/json": FileType.JSON,
143100
"application/rtf": FileType.RTF,
101+
"text/rtf": FileType.RTF,
144102
"text/html": FileType.HTML,
145103
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
146104
"application/vnd.ms-excel": FileType.XLS,
@@ -149,6 +107,7 @@ def __lt__(self, other):
149107
"application/xml": FileType.XML,
150108
"application/vnd.oasis.opendocument.text": FileType.ODT,
151109
"message/rfc822": FileType.EML,
110+
"application/x-ole-storage": FileType.MSG,
152111
"application/vnd.ms-outlook": FileType.MSG,
153112
}
154113

@@ -206,13 +165,9 @@ def detect_filetype(
206165
extension = extension.lower()
207166
if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
208167
mime_type = magic.from_file(filename or file_filename, mime=True) # type: ignore
209-
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
210-
# installed on the Unstructured docker image, .json files resolve to "text/plain"
211-
# rather than "application/json". this corrects for that case.
212-
if mime_type == "text/plain" and extension == ".json":
213-
return FileType.JSON
214168
else:
215169
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
170+
216171
elif file is not None:
217172
extension = None
218173
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
@@ -229,77 +184,41 @@ def detect_filetype(
229184
else:
230185
raise ValueError("No filename, file, nor file_filename were specified.")
231186

232-
if mime_type == "application/pdf":
233-
return FileType.PDF
187+
"""Mime type special cases."""
234188

235-
elif mime_type == "application/json":
189+
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
190+
# installed on the Unstructured docker image, .json files resolve to "text/plain"
191+
# rather than "application/json". this corrects for that case.
192+
if mime_type == "text/plain" and extension == ".json":
236193
return FileType.JSON
237194

238-
elif mime_type in DOCX_MIME_TYPES:
239-
return FileType.DOCX
240-
241-
elif mime_type in DOC_MIME_TYPES:
242-
return FileType.DOC
243-
244-
elif mime_type in ODT_MIME_TYPES:
245-
return FileType.ODT
246-
247-
elif mime_type in MSG_MIME_TYPES:
248-
return FileType.MSG
249-
250-
elif mime_type == "image/jpeg":
251-
return FileType.JPG
252-
253-
elif mime_type == "image/png":
254-
return FileType.PNG
255-
256-
elif mime_type in MD_MIME_TYPES:
257-
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
258-
return FileType.MD
259-
260-
elif mime_type in EPUB_MIME_TYPES:
261-
return FileType.EPUB
262-
263-
# NOTE(robinson) - examples are application/rtf or text/rtf.
264-
# magic often returns text/plain for RTF files
265-
elif mime_type.endswith("rtf"):
266-
return FileType.RTF
267-
268195
elif mime_type.endswith("xml"):
269196
if extension and (extension == ".html" or extension == ".htm"):
270197
return FileType.HTML
271198
else:
272199
return FileType.XML
273200

274-
elif mime_type == "text/html":
275-
return FileType.HTML
276-
277201
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
278202
if extension and extension == ".eml":
279203
return FileType.EML
280204
elif extension and extension == ".md":
281205
return FileType.MD
282206
elif extension and extension == ".rtf":
283207
return FileType.RTF
208+
elif extension and extension == ".html":
209+
return FileType.HTML
284210

285211
if _is_text_file_a_json(file=file, filename=filename):
286212
return FileType.JSON
287213

288214
if file and not extension and _check_eml_from_buffer(file=file) is True:
289215
return FileType.EML
290-
return FileType.TXT
291216

292-
elif mime_type in XLSX_MIME_TYPES:
293-
return FileType.XLSX
217+
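# Safety catch: a text-like MIME type that has an explicit entry in STR_TO_FILETYPE
# (e.g. "text/html" or "text/rtf") resolves to that type before falling back to TXT.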
218+
if mime_type in STR_TO_FILETYPE:
219+
return STR_TO_FILETYPE[mime_type]
294220

295-
elif mime_type in XLS_MIME_TYPES:
296-
return FileType.XLS
297-
298-
elif mime_type in PPTX_MIME_TYPES:
299-
return FileType.PPTX
300-
301-
elif mime_type in PPT_MIME_TYPES:
302-
return FileType.PPT
221+
return FileType.TXT
303222

304223
elif mime_type == "application/octet-stream":
305224
if file and not extension:
@@ -321,6 +240,10 @@ def detect_filetype(
321240
else:
322241
return EXT_TO_FILETYPE.get(extension.lower(), filetype)
323242

243+
# For everything else
244+
elif mime_type in STR_TO_FILETYPE:
245+
return STR_TO_FILETYPE[mime_type]
246+
324247
logger.warning(
325248
f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
326249
"This file type is not currently supported in unstructured.",

0 commit comments

Comments
 (0)