@@ -174,7 +174,9 @@ def parse_records(
174174 "content" : markdown ,
175175 "document_key" : file .uri ,
176176 "_ab_source_file_parse_error" : None ,
177- "_ab_source_file_last_modified" : file .last_modified .strftime ("%Y-%m-%dT%H:%M:%S.%fZ" ),
177+ "_ab_source_file_last_modified" : file .last_modified .strftime (
178+ "%Y-%m-%dT%H:%M:%S.%fZ"
179+ ),
178180 "_ab_source_file_url" : file .uri ,
179181 }
180182 except RecordParseError as e :
@@ -183,12 +185,16 @@ def parse_records(
183185 # otherwise, we raise the error to fail the sync
184186 if format .skip_unprocessable_files :
185187 exception_str = str (e )
186- logger .warning (f"File { file .uri } caused an error during parsing: { exception_str } ." )
188+ logger .warning (
189+ f"File { file .uri } caused an error during parsing: { exception_str } ."
190+ )
187191 yield {
188192 "content" : None ,
189193 "document_key" : file .uri ,
190194 "_ab_source_file_parse_error" : exception_str ,
191- "_ab_source_file_last_modified" : file .last_modified .strftime ("%Y-%m-%dT%H:%M:%S.%fZ" ),
195+ "_ab_source_file_last_modified" : file .last_modified .strftime (
196+ "%Y-%m-%dT%H:%M:%S.%fZ"
197+ ),
192198 "_ab_source_file_url" : file .uri ,
193199 }
194200 logger .warning (f"File { file .uri } cannot be parsed. Skipping it." )
@@ -445,16 +451,20 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
445451 try :
446452 file_content = file .read (4096 ) # Read a sample of the file to detect type
447453 file .seek (0 )
448-
449- if isinstance (file_content , bytes ) and file_content .startswith (b' %PDF-' ):
454+
455+ if isinstance (file_content , bytes ) and file_content .startswith (b" %PDF-" ):
450456 return FileType .PDF
451-
452- if isinstance (file_content , bytes ) and file_content .startswith (b'PK\x03 \x04 ' ):
453- if b'ppt/' in file_content or b'application/vnd.openxmlformats-officedocument.presentationml' in file_content :
457+
458+ if isinstance (file_content , bytes ) and file_content .startswith (b"PK\x03 \x04 " ):
459+ if (
460+ b"ppt/" in file_content
461+ or b"application/vnd.openxmlformats-officedocument.presentationml"
462+ in file_content
463+ ):
454464 return FileType .PPTX
455- elif b' word/' in file_content or b' [Content_Types].xml' in file_content :
465+ elif b" word/" in file_content or b" [Content_Types].xml" in file_content :
456466 return FileType .DOCX
457-
467+
458468 if file_content and isinstance (file_content , bytes ):
459469 try :
460470 content_str = file_content .decode ("utf-8" , errors = "ignore" )
@@ -464,11 +474,13 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
464474 or remote_file .uri .endswith (".md" )
465475 ):
466476 return FileType .MD
467- elif content_str .strip () and not any (c for c in content_str [:100 ] if ord (c ) > 127 ):
477+ elif content_str .strip () and not any (
478+ c for c in content_str [:100 ] if ord (c ) > 127
479+ ):
468480 return FileType .TXT
469481 except UnicodeDecodeError :
470482 pass # Not a text file
471-
483+
472484 type_based_on_content = FileType .UNK
473485 except Exception as e :
474486 type_based_on_content = FileType .UNK
0 commit comments