nodestream-proj
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/nodestream/databases/writer.py b/nodestream/databases/writer.py
Lines changed: 1 addition & 1 deletion
diff --git a/nodestream/pipeline/extractors/credential_utils.py b/nodestream/pipeline/extractors/credential_utils.py
Lines changed: 1 addition & 1 deletion
diff --git a/nodestream/pipeline/extractors/files.py b/nodestream/pipeline/extractors/files.py
Lines changed: 24 additions & 15 deletions
@@ -160,4 +160,5 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
+.idea/
+pytest.xml
@@ -14,7 +14,7 @@ def from_file_data(
         ),
         collect_stats: bool = True,
         batch_size: int = 1000,
-        **database_args
+        **database_args,
     ):
         connector = DatabaseConnector.from_database_args(
             database=database, **database_args
 
@@ -15,7 +15,7 @@ def __init__(
         assume_role_arn: Optional[str] = None,
         assume_role_external_id: Optional[str] = None,
         session_ttl: int = 3000,
-        **boto_session_args
+        **boto_session_args,
     ) -> None:
         self.assume_role_arn = assume_role_arn
         self.assume_role_external_id = assume_role_external_id
 
@@ -19,7 +19,7 @@
     Iterable,
     List,
     Optional,
-    Tuple,
+    Sequence,
 )
 
 import pandas as pd
@@ -74,7 +74,7 @@ def path_like(self) -> Path:
     @asynccontextmanager
     async def popped_suffix_tempfile(
         self,
-    ) -> AsyncContextManager[Tuple[Path, tempfile.NamedTemporaryFile]]:
+    ) -> AsyncContextManager[tuple[Path, tempfile.NamedTemporaryFile]]:
         """Create a temporary file with the same suffixes sans the last one.
 
         This method creates a temporary file with the same suffixes as the
@@ -179,6 +179,7 @@ def get_files(self) -> AsyncIterator[ReadableFile]:
         """
         raise NotImplementedError
 
+    @abstractmethod
     def describe(self) -> str:
         """Return a human-readable description of the file source.
 
@@ -187,7 +188,6 @@ def describe(self) -> str:
         way that is understandable to the user. The description should be
         concise and informative.
         """
-        return str(self)
 
 
 @SUPPORTED_FILE_FORMAT_REGISTRY.connect_baseclass
@@ -481,7 +481,7 @@ class RemoteFileSource(FileSource, alias="http"):
     """
 
     def __init__(
-        self, urls: Iterable[str], memory_spooling_max_size_in_mb: int = 10
+        self, urls: Sequence[str], memory_spooling_max_size_in_mb: int = 10
     ) -> None:
         self.urls = urls
         self.memory_spooling_max_size = memory_spooling_max_size_in_mb * 1024 * 1024
@@ -528,7 +528,7 @@ def archive_if_required(self, key: str):
         if not self.archive_dir:
             return
 
-        self.logger.info("Archiving S3 Object", extra=dict(key=key))
+        self.logger.info("Archiving S3 Object", extra={"key": key})
         filename = Path(key).name
         self.s3_client.copy(
             Bucket=self.bucket,
@@ -617,6 +617,12 @@ async def get_files(self):
                 object_format=self.object_format,
             )
 
+    def describe(self) -> str:
+        return (
+            f"S3FileSource{{bucket: {self.bucket}, prefix: {self.prefix}, "
+            f"archive_dir: {self.archive_dir}, object_format: {self.object_format}}}"
+        )
+
 
 class FileExtractor(Extractor):
     """A class that extracts records from files.
@@ -629,19 +635,19 @@ class FileExtractor(Extractor):
     """
 
     @classmethod
-    def local(cls, globs: Iterable[str]):
+    def local(cls, globs: Iterable[str]) -> "FileExtractor":
         return FileExtractor.from_file_data([{"type": "local", "globs": globs}])
 
     @classmethod
-    def s3(cls, **kwargs):
+    def s3(cls, **kwargs) -> "FileExtractor":
         return cls([S3FileSource.from_file_data(**kwargs)])
 
     @classmethod
     def remote(
         cls,
         urls: Iterable[str],
         memory_spooling_max_size_in_mb: int = 10,
-    ):
+    ) -> "FileExtractor":
         return FileExtractor.from_file_data(
             [
                 {
@@ -653,17 +659,19 @@ def remote(
         )
 
     @classmethod
-    def from_file_data(cls, sources: List[Dict[str, Any]]) -> "FileExtractor":
+    def from_file_data(cls, sources: list[dict[str, Any]]) -> "FileExtractor":
         return cls(
             [FileSource.from_file_data_with_type_label(source) for source in sources]
         )
 
-    def __init__(self, file_sources: Iterable[FileSource]) -> None:
+    def __init__(self, file_sources: Sequence[FileSource]) -> None:
         self.file_sources = file_sources
         self.logger = getLogger(__name__)
 
-    async def read_file(self, file: ReadableFile) -> Iterable[JsonLikeDocument]:
-        intermediaries: List[AsyncContextManager[ReadableFile]] = []
+    async def read_file(
+        self, file: ReadableFile
+    ) -> AsyncGenerator[JsonLikeDocument, None]:
+        intermediaries: list[AsyncContextManager[ReadableFile]] = []
 
         while True:
             suffix = file.path_like().suffix
@@ -695,10 +703,10 @@ async def read_file(self, file: ReadableFile) -> Iterable[JsonLikeDocument]:
                 pass
             except Exception as e:
                 self.logger.warning(
-                    f"Failed to parse {file.path_like()} file. Please ensure the file is in the correct format.",
+                    "Failed to parse %s file. Please ensure the file is in the correct format.",
+                    file.path_like(),
                     extra={"exception": str(e)},
                 )
-                pass
 
             # Regardless of whether we found a codec or not, break out of the
             # loop and yield no more records because either (a) we found a
@@ -720,5 +728,6 @@ async def extract_records(self) -> AsyncGenerator[Any, Any]:
 
             if total_files_from_source == 0:
                 self.logger.warning(
-                    f"No files found for source: {file_source.describe()}"
+                    "No files found for source: %s",
+                    file_source.describe(),
                 )