|
13 | 13 | import mimetypes |
14 | 14 | import nltk |
15 | 15 | import requests |
16 | | -from unstructured.file_utils.filetype import ( |
17 | | - FileType, |
18 | | - detect_filetype, |
19 | | -) |
| 16 | +from unstructured.file_utils.filetype import FileType, detect_filetype |
20 | 17 |
|
21 | 18 | from airbyte_cdk.models import FailureType |
22 | 19 | from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig |
@@ -334,10 +331,10 @@ def _read_file_remotely( |
334 | 331 | data = self._params_to_dict(format.parameters, strategy) |
335 | 332 |
|
336 | 333 | mime_type = mimetypes.guess_type(f"file.{filetype.name.lower()}")[0] if filetype else "application/octet-stream" |
337 | | - file_data = {"files": ("filename", file_handle, mime_type)} |
| 334 | + files = {"files": ("filename", file_handle, mime_type)} |
338 | 335 |
|
339 | 336 | response = requests.post( |
340 | | - f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data |
| 337 | + f"{format.api_url}/general/v0/general", headers=headers, data=data, files=files |
341 | 338 | ) |
342 | 339 |
|
343 | 340 | if response.status_code == 422: |
@@ -416,17 +413,17 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT |
416 | 413 | # detect_filetype is either using the file name or file content |
417 | 414 | # if possible, try to leverage the file name to detect the file type |
418 | 415 | # if the file name is not available, use the file content |
419 | | - file_type: FileType | None = None |
| 416 | + detected_type: FileType | None = None |
420 | 417 | try: |
421 | | - file_type = detect_filetype( |
| 418 | + detected_type = detect_filetype( |
422 | 419 | filename=remote_file.uri, |
423 | 420 | ) |
424 | 421 | except Exception: |
425 | 422 | # Path doesn't exist locally. Try something else... |
426 | 423 | pass |
427 | 424 |
|
428 | | - if file_type and file_type != FileType.UNK: |
429 | | - return file_type |
| 425 | + if detected_type and detected_type != FileType.UNK: |
| 426 | + return detected_type |
430 | 427 |
|
431 | 428 | type_based_on_content = detect_filetype(file=file) |
432 | 429 | file.seek(0) # detect_filetype is reading to read the file content, so we need to reset |
|
0 commit comments