Skip to content

Commit a12b989

Browse files
fix: improve PPTX file type detection for files without extensions
Co-Authored-By: Aaron <AJ> Steers <aj@airbyte.io>
1 parent dfe037f commit a12b989

File tree

1 file changed

+45
-27
lines changed

1 file changed

+45
-27
lines changed

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 45 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -174,20 +174,24 @@ def parse_records(
174174
"content": markdown,
175175
"document_key": file.uri,
176176
"_ab_source_file_parse_error": None,
177+
"_ab_source_file_last_modified": file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
178+
"_ab_source_file_url": file.uri,
177179
}
178180
except RecordParseError as e:
179181
# RecordParseError is raised when the file can't be parsed because of a problem with the file content (either the file is not supported or the file is corrupted)
180182
# if the skip_unprocessable_files flag is set, we log a warning and pass the error as part of the document
181183
# otherwise, we raise the error to fail the sync
182184
if format.skip_unprocessable_files:
183185
exception_str = str(e)
184-
logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.")
186+
logger.warning(f"File {file.uri} caused an error during parsing: {exception_str}.")
185187
yield {
186188
"content": None,
187189
"document_key": file.uri,
188190
"_ab_source_file_parse_error": exception_str,
191+
"_ab_source_file_last_modified": file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
192+
"_ab_source_file_url": file.uri,
189193
}
190-
logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
194+
logger.warning(f"File {file.uri} cannot be parsed. Skipping it.")
191195
else:
192196
raise e
193197
except Exception as e:
@@ -370,24 +374,25 @@ def _read_file_locally(
370374
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
371375
raise Exception("unstructured library is not available")
372376

373-
file: Any = file_handle
374-
375377
# before the parsing logic is entered, the file is read completely to make sure it is in local memory
376378
file_handle.seek(0)
377-
file_handle.read()
379+
file_content = file_handle.read()
378380
file_handle.seek(0)
379381

380382
try:
381-
if filetype == FileType.PDF:
382-
# for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects
383-
file_handle.seek(0)
384-
with BytesIO(file_handle.read()) as file:
385-
file_handle.seek(0)
383+
# For all file types, create a fresh BytesIO to avoid issues with file-like objects
384+
with BytesIO(file_content) as file:
385+
if filetype == FileType.PDF:
386386
elements = unstructured_partition_pdf(file=file, strategy=strategy)
387-
elif filetype == FileType.DOCX:
388-
elements = unstructured_partition_docx(file=file)
389-
elif filetype == FileType.PPTX:
390-
elements = unstructured_partition_pptx(file=file)
387+
elif filetype == FileType.DOCX:
388+
elements = unstructured_partition_docx(file=file)
389+
elif filetype == FileType.PPTX:
390+
elements = unstructured_partition_pptx(file=file)
391+
else:
392+
raise self._create_parse_error(
393+
remote_file,
394+
f"Unsupported file type {filetype} for local processing",
395+
)
391396
except Exception as e:
392397
raise self._create_parse_error(remote_file, str(e))
393398

@@ -438,21 +443,34 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
438443

439444
file.seek(0)
440445
try:
441-
file_content = file.read()
446+
file_content = file.read(4096) # Read a sample of the file to detect type
442447
file.seek(0)
448+
449+
if isinstance(file_content, bytes) and file_content.startswith(b'%PDF-'):
450+
return FileType.PDF
451+
452+
if isinstance(file_content, bytes) and file_content.startswith(b'PK\x03\x04'):
453+
if b'ppt/' in file_content or b'application/vnd.openxmlformats-officedocument.presentationml' in file_content:
454+
return FileType.PPTX
455+
elif b'word/' in file_content or b'[Content_Types].xml' in file_content:
456+
return FileType.DOCX
457+
443458
if file_content and isinstance(file_content, bytes):
444-
content_str = file_content.decode("utf-8", errors="ignore")
445-
if (
446-
content_str.lstrip().startswith("#")
447-
or remote_file.mime_type == "text/markdown"
448-
or remote_file.uri.endswith(".md")
449-
):
450-
type_based_on_content = FileType.MD
451-
else:
452-
type_based_on_content = FileType.UNK
453-
else:
454-
type_based_on_content = FileType.UNK
455-
except Exception:
459+
try:
460+
content_str = file_content.decode("utf-8", errors="ignore")
461+
if (
462+
content_str.lstrip().startswith("#")
463+
or remote_file.mime_type == "text/markdown"
464+
or remote_file.uri.endswith(".md")
465+
):
466+
return FileType.MD
467+
elif content_str.strip() and not any(c for c in content_str[:100] if ord(c) > 127):
468+
return FileType.TXT
469+
except UnicodeDecodeError:
470+
pass # Not a text file
471+
472+
type_based_on_content = FileType.UNK
473+
except Exception as e:
456474
type_based_on_content = FileType.UNK
457475
file.seek(0) # Reset file position after reading
458476

0 commit comments

Comments
 (0)