19
19
Iterable ,
20
20
List ,
21
21
Optional ,
22
- Tuple ,
22
+ Sequence ,
23
23
)
24
24
25
25
import pandas as pd
@@ -74,7 +74,7 @@ def path_like(self) -> Path:
74
74
@asynccontextmanager
75
75
async def popped_suffix_tempfile (
76
76
self ,
77
- ) -> AsyncContextManager [Tuple [Path , tempfile .NamedTemporaryFile ]]:
77
+ ) -> AsyncContextManager [tuple [Path , tempfile .NamedTemporaryFile ]]:
78
78
"""Create a temporary file with the same suffixes sans the last one.
79
79
80
80
This method creates a temporary file with the same suffixes as the
@@ -179,6 +179,7 @@ def get_files(self) -> AsyncIterator[ReadableFile]:
179
179
"""
180
180
raise NotImplementedError
181
181
182
+ @abstractmethod
182
183
def describe (self ) -> str :
183
184
"""Return a human-readable description of the file source.
184
185
@@ -187,7 +188,6 @@ def describe(self) -> str:
187
188
way that is understandable to the user. The description should be
188
189
concise and informative.
189
190
"""
190
- return str (self )
191
191
192
192
193
193
@SUPPORTED_FILE_FORMAT_REGISTRY .connect_baseclass
@@ -481,7 +481,7 @@ class RemoteFileSource(FileSource, alias="http"):
481
481
"""
482
482
483
483
def __init__ (
484
- self , urls : Iterable [str ], memory_spooling_max_size_in_mb : int = 10
484
+ self , urls : Sequence [str ], memory_spooling_max_size_in_mb : int = 10
485
485
) -> None :
486
486
self .urls = urls
487
487
self .memory_spooling_max_size = memory_spooling_max_size_in_mb * 1024 * 1024
@@ -528,7 +528,7 @@ def archive_if_required(self, key: str):
528
528
if not self .archive_dir :
529
529
return
530
530
531
- self .logger .info ("Archiving S3 Object" , extra = dict ( key = key ) )
531
+ self .logger .info ("Archiving S3 Object" , extra = { " key" : key } )
532
532
filename = Path (key ).name
533
533
self .s3_client .copy (
534
534
Bucket = self .bucket ,
@@ -617,6 +617,12 @@ async def get_files(self):
617
617
object_format = self .object_format ,
618
618
)
619
619
620
+ def describe (self ) -> str :
621
+ return (
622
+ f"S3FileSource{{bucket: { self .bucket } , prefix: { self .prefix } , "
623
+ f"archive_dir: { self .archive_dir } , object_format: { self .object_format } }}"
624
+ )
625
+
620
626
621
627
class FileExtractor (Extractor ):
622
628
"""A class that extracts records from files.
@@ -629,19 +635,19 @@ class FileExtractor(Extractor):
629
635
"""
630
636
631
637
@classmethod
632
- def local (cls , globs : Iterable [str ]):
638
+ def local (cls , globs : Iterable [str ]) -> "FileExtractor" :
633
639
return FileExtractor .from_file_data ([{"type" : "local" , "globs" : globs }])
634
640
635
641
@classmethod
636
- def s3 (cls , ** kwargs ):
642
+ def s3 (cls , ** kwargs ) -> "FileExtractor" :
637
643
return cls ([S3FileSource .from_file_data (** kwargs )])
638
644
639
645
@classmethod
640
646
def remote (
641
647
cls ,
642
648
urls : Iterable [str ],
643
649
memory_spooling_max_size_in_mb : int = 10 ,
644
- ):
650
+ ) -> "FileExtractor" :
645
651
return FileExtractor .from_file_data (
646
652
[
647
653
{
@@ -653,17 +659,19 @@ def remote(
653
659
)
654
660
655
661
@classmethod
656
- def from_file_data (cls , sources : List [ Dict [str , Any ]]) -> "FileExtractor" :
662
+ def from_file_data (cls , sources : list [ dict [str , Any ]]) -> "FileExtractor" :
657
663
return cls (
658
664
[FileSource .from_file_data_with_type_label (source ) for source in sources ]
659
665
)
660
666
661
- def __init__ (self , file_sources : Iterable [FileSource ]) -> None :
667
+ def __init__ (self , file_sources : Sequence [FileSource ]) -> None :
662
668
self .file_sources = file_sources
663
669
self .logger = getLogger (__name__ )
664
670
665
- async def read_file (self , file : ReadableFile ) -> Iterable [JsonLikeDocument ]:
666
- intermediaries : List [AsyncContextManager [ReadableFile ]] = []
671
+ async def read_file (
672
+ self , file : ReadableFile
673
+ ) -> AsyncGenerator [JsonLikeDocument , None ]:
674
+ intermediaries : list [AsyncContextManager [ReadableFile ]] = []
667
675
668
676
while True :
669
677
suffix = file .path_like ().suffix
@@ -695,10 +703,10 @@ async def read_file(self, file: ReadableFile) -> Iterable[JsonLikeDocument]:
695
703
pass
696
704
except Exception as e :
697
705
self .logger .warning (
698
- f"Failed to parse { file .path_like ()} file. Please ensure the file is in the correct format." ,
706
+ "Failed to parse %s file. Please ensure the file is in the correct format." ,
707
+ file .path_like (),
699
708
extra = {"exception" : str (e )},
700
709
)
701
- pass
702
710
703
711
# Regardless of whether we found a codec or not, break out of the
704
712
# loop and yield no more records because either (a) we found a
@@ -720,5 +728,6 @@ async def extract_records(self) -> AsyncGenerator[Any, Any]:
720
728
721
729
if total_files_from_source == 0 :
722
730
self .logger .warning (
723
- f"No files found for source: { file_source .describe ()} "
731
+ "No files found for source: %s" ,
732
+ file_source .describe (),
724
733
)
0 commit comments