Skip to content

Commit 5646f38

Browse files
fix: update unstructured_parser.py to work with unstructured 0.17.2
Co-Authored-By: Aaron <AJ> Steers <[email protected]>
1 parent 1ff8b81 commit 5646f38

File tree

1 file changed

+10
-8
lines changed

1 file changed

+10
-8
lines changed

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10,12 +10,10 @@
1010

1111
import backoff
1212
import dpath
13+
import mimetypes
1314
import nltk
1415
import requests
1516
from unstructured.file_utils.filetype import (
16-
EXT_TO_FILETYPE,
17-
FILETYPE_TO_MIMETYPE,
18-
STR_TO_FILETYPE,
1917
FileType,
2018
detect_filetype,
2119
)
@@ -335,7 +333,8 @@ def _read_file_remotely(
335333

336334
data = self._params_to_dict(format.parameters, strategy)
337335

338-
file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
336+
mime_type = mimetypes.guess_type(f"file.{filetype.name.lower()}")[0] if filetype else "application/octet-stream"
337+
file_data = {"files": ("filename", file_handle, mime_type)}
339338

340339
response = requests.post(
341340
f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
@@ -405,8 +404,10 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
405404
2. Use the file name if available
406405
3. Use the file content
407406
"""
408-
if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE:
409-
return STR_TO_FILETYPE[remote_file.mime_type]
407+
if remote_file.mime_type:
408+
for file_type in FileType:
409+
if mimetypes.guess_type(f"file.{file_type.name.lower()}")[0] == remote_file.mime_type:
410+
return file_type
410411

411412
# set name to none, otherwise unstructured will try to get the modified date from the local file system
412413
if hasattr(file, "name"):
@@ -434,8 +435,9 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
434435
return type_based_on_content
435436

436437
extension = "." + remote_file.uri.split(".")[-1].lower()
437-
if extension in EXT_TO_FILETYPE:
438-
return EXT_TO_FILETYPE[extension]
438+
for file_type in FileType:
439+
if file_type.name.lower() == extension[1:].lower():
440+
return file_type
439441

440442
return None
441443

0 commit comments

Comments
 (0)