2020from unstructured .logger import logger
2121from unstructured .nlp .patterns import EMAIL_HEAD_RE
2222
23- DOCX_MIME_TYPES = [
24- "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ,
25- ]
26-
27- DOC_MIME_TYPES = [
28- "application/msword" ,
29- ]
30-
31- ODT_MIME_TYPES = [
32- "application/vnd.oasis.opendocument.text" ,
33- ]
34-
35- XLSX_MIME_TYPES = [
36- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ,
37- ]
38-
39- XLS_MIME_TYPES = [
40- "application/vnd.ms-excel" ,
41- ]
42-
43- PPTX_MIME_TYPES = [
44- "application/vnd.openxmlformats-officedocument.presentationml.presentation" ,
45- ]
46-
47- PPT_MIME_TYPES = [
48- "application/vnd.ms-powerpoint" ,
49- ]
50-
51- MSG_MIME_TYPES = [
52- "application/vnd.ms-outlook" ,
53- "application/x-ole-storage" ,
54- ]
55-
5623TXT_MIME_TYPES = [
5724 "text/plain" ,
5825 "message/rfc822" , # ref: https://www.rfc-editor.org/rfc/rfc822
5926]
6027
61- MD_MIME_TYPES = [
62- "text/markdown" ,
63- "text/x-markdown" ,
64- ]
65-
66- EPUB_MIME_TYPES = [
67- "application/epub" ,
68- "application/epub+zip" ,
69- ]
70-
7128# NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension.
7229# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
7330# looking for expected filenames within the zip file.
@@ -141,6 +98,7 @@ def __lt__(self, other):
14198 "application/epub+zip" : FileType .EPUB ,
14299 "application/json" : FileType .JSON ,
143100 "application/rtf" : FileType .RTF ,
101+ "text/rtf" : FileType .RTF ,
144102 "text/html" : FileType .HTML ,
145103 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" : FileType .XLSX ,
146104 "application/vnd.ms-excel" : FileType .XLS ,
@@ -149,6 +107,7 @@ def __lt__(self, other):
149107 "application/xml" : FileType .XML ,
150108 "application/vnd.oasis.opendocument.text" : FileType .ODT ,
151109 "message/rfc822" : FileType .EML ,
110+ "application/x-ole-storage" : FileType .MSG ,
152111 "application/vnd.ms-outlook" : FileType .MSG ,
153112}
154113
@@ -206,13 +165,9 @@ def detect_filetype(
206165 extension = extension .lower ()
207166 if os .path .isfile (_filename ) and LIBMAGIC_AVAILABLE :
208167 mime_type = magic .from_file (filename or file_filename , mime = True ) # type: ignore
209- # NOTE(crag): for older versions of the OS libmagic package, such as is currently
210- # installed on the Unstructured docker image, .json files resolve to "text/plain"
211- # rather than "application/json". this corrects for that case.
212- if mime_type == "text/plain" and extension == ".json" :
213- return FileType .JSON
214168 else :
215169 return EXT_TO_FILETYPE .get (extension .lower (), FileType .UNK )
170+
216171 elif file is not None :
217172 extension = None
218173 # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
@@ -229,77 +184,41 @@ def detect_filetype(
229184 else :
230185 raise ValueError ("No filename, file, nor file_filename were specified." )
231186
232- if mime_type == "application/pdf" :
233- return FileType .PDF
187+ """Mime type special cases."""
234188
235- elif mime_type == "application/json" :
189+ # NOTE(crag): for older versions of the OS libmagic package, such as is currently
190+ # installed on the Unstructured docker image, .json files resolve to "text/plain"
191+ # rather than "application/json". this corrects for that case.
192+ if mime_type == "text/plain" and extension == ".json" :
236193 return FileType .JSON
237194
238- elif mime_type in DOCX_MIME_TYPES :
239- return FileType .DOCX
240-
241- elif mime_type in DOC_MIME_TYPES :
242- return FileType .DOC
243-
244- elif mime_type in ODT_MIME_TYPES :
245- return FileType .ODT
246-
247- elif mime_type in MSG_MIME_TYPES :
248- return FileType .MSG
249-
250- elif mime_type == "image/jpeg" :
251- return FileType .JPG
252-
253- elif mime_type == "image/png" :
254- return FileType .PNG
255-
256- elif mime_type in MD_MIME_TYPES :
257- # NOTE - I am not sure whether libmagic ever returns these mimetypes.
258- return FileType .MD
259-
260- elif mime_type in EPUB_MIME_TYPES :
261- return FileType .EPUB
262-
263- # NOTE(robinson) - examples are application/rtf or text/rtf.
264- # magic often returns text/plain for RTF files
265- elif mime_type .endswith ("rtf" ):
266- return FileType .RTF
267-
268195 elif mime_type .endswith ("xml" ):
269196 if extension and (extension == ".html" or extension == ".htm" ):
270197 return FileType .HTML
271198 else :
272199 return FileType .XML
273200
274- elif mime_type == "text/html" :
275- return FileType .HTML
276-
277201 elif mime_type in TXT_MIME_TYPES or mime_type .startswith ("text" ):
278202 if extension and extension == ".eml" :
279203 return FileType .EML
280204 elif extension and extension == ".md" :
281205 return FileType .MD
282206 elif extension and extension == ".rtf" :
283207 return FileType .RTF
208+ elif extension and extension == ".html" :
209+ return FileType .HTML
284210
285211 if _is_text_file_a_json (file = file , filename = filename ):
286212 return FileType .JSON
287213
288214 if file and not extension and _check_eml_from_buffer (file = file ) is True :
289215 return FileType .EML
290- return FileType .TXT
291216
292- elif mime_type in XLSX_MIME_TYPES :
293- return FileType .XLSX
217+ # Safety catch
218+ if mime_type in STR_TO_FILETYPE :
219+ return STR_TO_FILETYPE [mime_type ]
294220
295- elif mime_type in XLS_MIME_TYPES :
296- return FileType .XLS
297-
298- elif mime_type in PPTX_MIME_TYPES :
299- return FileType .PPTX
300-
301- elif mime_type in PPT_MIME_TYPES :
302- return FileType .PPT
221+ return FileType .TXT
303222
304223 elif mime_type == "application/octet-stream" :
305224 if file and not extension :
@@ -321,6 +240,10 @@ def detect_filetype(
321240 else :
322241 return EXT_TO_FILETYPE .get (extension .lower (), filetype )
323242
243+ # For everything else
244+ elif mime_type in STR_TO_FILETYPE :
245+ return STR_TO_FILETYPE [mime_type ]
246+
324247 logger .warning (
325248 f"The MIME type{ f' of { filename !r} ' if filename else '' } is { mime_type !r} . "
326249 "This file type is not currently supported in unstructured." ,
0 commit comments