Skip to content

Commit e77dc1b

Browse files
style: fix formatting issues
Co-Authored-By: Aaron <AJ> Steers <[email protected]>
1 parent a12b989 commit e77dc1b

File tree

1 file changed

+24
-12
lines changed

1 file changed

+24
-12
lines changed

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 24 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,9 @@ def parse_records(
174174
"content": markdown,
175175
"document_key": file.uri,
176176
"_ab_source_file_parse_error": None,
177-
"_ab_source_file_last_modified": file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
177+
"_ab_source_file_last_modified": file.last_modified.strftime(
178+
"%Y-%m-%dT%H:%M:%S.%fZ"
179+
),
178180
"_ab_source_file_url": file.uri,
179181
}
180182
except RecordParseError as e:
@@ -183,12 +185,16 @@ def parse_records(
183185
# otherwise, we raise the error to fail the sync
184186
if format.skip_unprocessable_files:
185187
exception_str = str(e)
186-
logger.warning(f"File {file.uri} caused an error during parsing: {exception_str}.")
188+
logger.warning(
189+
f"File {file.uri} caused an error during parsing: {exception_str}."
190+
)
187191
yield {
188192
"content": None,
189193
"document_key": file.uri,
190194
"_ab_source_file_parse_error": exception_str,
191-
"_ab_source_file_last_modified": file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
195+
"_ab_source_file_last_modified": file.last_modified.strftime(
196+
"%Y-%m-%dT%H:%M:%S.%fZ"
197+
),
192198
"_ab_source_file_url": file.uri,
193199
}
194200
logger.warning(f"File {file.uri} cannot be parsed. Skipping it.")
@@ -445,16 +451,20 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
445451
try:
446452
file_content = file.read(4096) # Read a sample of the file to detect type
447453
file.seek(0)
448-
449-
if isinstance(file_content, bytes) and file_content.startswith(b'%PDF-'):
454+
455+
if isinstance(file_content, bytes) and file_content.startswith(b"%PDF-"):
450456
return FileType.PDF
451-
452-
if isinstance(file_content, bytes) and file_content.startswith(b'PK\x03\x04'):
453-
if b'ppt/' in file_content or b'application/vnd.openxmlformats-officedocument.presentationml' in file_content:
457+
458+
if isinstance(file_content, bytes) and file_content.startswith(b"PK\x03\x04"):
459+
if (
460+
b"ppt/" in file_content
461+
or b"application/vnd.openxmlformats-officedocument.presentationml"
462+
in file_content
463+
):
454464
return FileType.PPTX
455-
elif b'word/' in file_content or b'[Content_Types].xml' in file_content:
465+
elif b"word/" in file_content or b"[Content_Types].xml" in file_content:
456466
return FileType.DOCX
457-
467+
458468
if file_content and isinstance(file_content, bytes):
459469
try:
460470
content_str = file_content.decode("utf-8", errors="ignore")
@@ -464,11 +474,13 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
464474
or remote_file.uri.endswith(".md")
465475
):
466476
return FileType.MD
467-
elif content_str.strip() and not any(c for c in content_str[:100] if ord(c) > 127):
477+
elif content_str.strip() and not any(
478+
c for c in content_str[:100] if ord(c) > 127
479+
):
468480
return FileType.TXT
469481
except UnicodeDecodeError:
470482
pass # Not a text file
471-
483+
472484
type_based_on_content = FileType.UNK
473485
except Exception as e:
474486
type_based_on_content = FileType.UNK

0 commit comments

Comments
 (0)