|
10 | 10 |
|
11 | 11 | import backoff |
12 | 12 | import dpath |
| 13 | +import mimetypes |
13 | 14 | import nltk |
14 | 15 | import requests |
15 | 16 | from unstructured.file_utils.filetype import ( |
16 | | - EXT_TO_FILETYPE, |
17 | | - FILETYPE_TO_MIMETYPE, |
18 | | - STR_TO_FILETYPE, |
19 | 17 | FileType, |
20 | 18 | detect_filetype, |
21 | 19 | ) |
@@ -335,7 +333,8 @@ def _read_file_remotely( |
335 | 333 |
|
336 | 334 | data = self._params_to_dict(format.parameters, strategy) |
337 | 335 |
|
338 | | - file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])} |
| 336 | + mime_type = mimetypes.guess_type(f"file.{filetype.name.lower()}")[0] if filetype else "application/octet-stream" |
| 337 | + file_data = {"files": ("filename", file_handle, mime_type)} |
339 | 338 |
|
340 | 339 | response = requests.post( |
341 | 340 | f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data |
@@ -405,8 +404,10 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT |
405 | 404 | 2. Use the file name if available |
406 | 405 | 3. Use the file content |
407 | 406 | """ |
408 | | - if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE: |
409 | | - return STR_TO_FILETYPE[remote_file.mime_type] |
| 407 | + if remote_file.mime_type: |
| 408 | + for file_type in FileType: |
| 409 | + if mimetypes.guess_type(f"file.{file_type.name.lower()}")[0] == remote_file.mime_type: |
| 410 | + return file_type |
410 | 411 |
|
411 | 412 | # set name to none, otherwise unstructured will try to get the modified date from the local file system |
412 | 413 | if hasattr(file, "name"): |
@@ -434,8 +435,9 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT |
434 | 435 | return type_based_on_content |
435 | 436 |
|
436 | 437 | extension = "." + remote_file.uri.split(".")[-1].lower() |
437 | | - if extension in EXT_TO_FILETYPE: |
438 | | - return EXT_TO_FILETYPE[extension] |
| 438 | + for file_type in FileType: |
| 439 | + if file_type.name.lower() == extension[1:].lower(): |
| 440 | + return file_type |
439 | 441 |
|
440 | 442 | return None |
441 | 443 |
|
|
0 commit comments