|
6 | 6 | import logging |
7 | 7 | import os.path |
8 | 8 | from enum import Enum |
| 9 | +from os.path import exists |
9 | 10 | from pathlib import Path |
10 | | -from typing import TextIO |
| 11 | +from typing import TextIO, Callable, List, Tuple |
11 | 12 |
|
12 | 13 | # ndjson missing types: https://github.com/rhgrant10/ndjson/issues/10 |
13 | 14 | import ndjson # type: ignore |
@@ -60,14 +61,17 @@ def load(self, file: TextIO) -> list[DuneRecord]: |
60 | 61 | return list(ndjson.reader(file)) |
61 | 62 | raise ValueError(f"Unrecognized FileType {self} for {file.name}") |
62 | 63 |
|
63 | | - def write(self, out_file: TextIO, data: list[DuneRecord]) -> None: |
| 64 | + def write( |
| 65 | + self, out_file: TextIO, data: list[DuneRecord], skip_headers: bool = False |
| 66 | + ) -> None: |
64 | 67 | """Writes `data` to `out_file`""" |
65 | 68 | logger.debug(f"writing results to file {out_file.name}") |
66 | 69 | if self == FileType.CSV: |
67 | 70 | headers = data[0].keys() |
68 | 71 | data_tuple = [tuple(rec.values()) for rec in data] |
69 | 72 | dict_writer = csv.DictWriter(out_file, headers, lineterminator="\n") |
70 | | - dict_writer.writeheader() |
| 73 | + if not skip_headers: |
| 74 | + dict_writer.writeheader() |
71 | 75 | writer = csv.writer(out_file, lineterminator="\n") |
72 | 76 | writer.writerows(data_tuple) |
73 | 77 |
|
@@ -106,13 +110,77 @@ def _filepath(self, name: str, ftype: FileType) -> str: |
106 | 110 | """Internal method for building absolute path.""" |
107 | 111 | return os.path.join(self.path, name + str(ftype)) |
108 | 112 |
|
109 | | - def _write(self, data: list[DuneRecord], name: str, ftype: FileType) -> None: |
110 | | - # TODO - use @skip_empty decorator here. Couldn't get the types to work. |
| 113 | + def _write(self, data: List[DuneRecord], name: str, ftype: FileType) -> None: |
| 114 | + # The following three lines are duplicated in _append, due to python version compatibility |
| 115 | + # https://github.com/cowprotocol/dune-client/issues/45 |
| 116 | + # We will continue to support python < 3.10 until ~3.13, this issue will remain open. |
111 | 117 | if len(data) == 0: |
112 | 118 | logger.info(f"Nothing to write to {name}... skipping") |
113 | | - return |
| 119 | + return None |
114 | 120 | with open(self._filepath(name, ftype), "w", encoding=self.encoding) as out_file: |
115 | 121 | ftype.write(out_file, data) |
| 122 | + return None |
| 123 | + |
| 124 | + def _assert_matching_keys( |
| 125 | + self, keys: Tuple[str, ...], fname: str, ftype: FileType |
| 126 | + ) -> None: |
| 127 | + with open(fname, "r", encoding=self.encoding) as file: |
| 128 | + if ftype == FileType.CSV: |
| 129 | + # Check matching headers. |
| 130 | + headers = file.readline() |
| 131 | + existing_keys = headers.strip().split(",") |
| 132 | + elif ftype == FileType.JSON: |
| 133 | + single_object = json.loads(file.readline())[0] |
| 134 | + existing_keys = single_object.keys() |
| 135 | + elif ftype == FileType.NDJSON: |
| 136 | + single_object = json.loads(file.readline()) |
| 137 | + existing_keys = single_object.keys() |
| 138 | + |
| 139 | + key_tuple = tuple(existing_keys) |
| 140 | + assert keys == key_tuple, f"{keys} != {key_tuple}" |
| 141 | + |
| 142 | + def _append(self, data: List[DuneRecord], name: str, ftype: FileType) -> None: |
| 143 | + if len(data) == 0: |
| 144 | + logger.info(f"Nothing to write to {name}... skipping") |
| 145 | + return None |
| 146 | + fname = self._filepath(name, ftype) |
| 147 | + if not exists(fname): |
| 148 | + logger.warning( |
| 149 | + f"File {fname} does not exist, using write instead of append!" |
| 150 | + ) |
| 151 | + return self._write(data, name, ftype) |
| 152 | + |
| 153 | + # validate that the incoming content to be appended has the same schema |
| 154 | + # The skip empty decorator ensures existence of data[0]! |
| 155 | + self._assert_matching_keys(tuple(data[0].keys()), fname, ftype) |
| 156 | + |
| 157 | + if ftype == FileType.JSON: |
| 158 | + # These are JSON lists, so we have to concatenate the data. |
| 159 | + with open(fname, "r", encoding=self.encoding) as existing_file: |
| 160 | + existing_data = ftype.load(existing_file) |
| 161 | + return self._write(existing_data + data, name, ftype) |
| 162 | + |
| 163 | + with open(fname, "a+", encoding=self.encoding) as out_file: |
| 164 | + return ftype.write(out_file, data, skip_headers=True) |
| 165 | + |
| 166 | + def append_csv(self, data: list[DuneRecord], name: str) -> None: |
| 167 | + """Appends `data` to csv file `name`""" |
| 168 | + # This is a special case because we want to skip headers when the file already exists |
| 169 | + # Additionally, we may want to validate that the headers actually coincide. |
| 170 | + self._append(data, name, FileType.CSV) |
| 171 | + |
| 172 | + def append_json(self, data: list[DuneRecord], name: str) -> None: |
| 173 | + """ |
| 174 | + Appends `data` to json file `name` |
| 175 | + This is the least efficient of all, since we have to load the entire file, |
| 176 | + concatenate the lists and then overwrite the file! |
| 177 | + Other filetypes such as CSV and NDJSON can be directly appended to! |
| 178 | + """ |
| 179 | + self._append(data, name, FileType.JSON) |
| 180 | + |
| 181 | + def append_ndjson(self, data: list[DuneRecord], name: str) -> None: |
| 182 | + """Appends `data` to ndjson file `name`""" |
| 183 | + self._append(data, name, FileType.NDJSON) |
116 | 184 |
|
117 | 185 | def write_csv(self, data: list[DuneRecord], name: str) -> None: |
118 | 186 | """Writes `data` to csv file `name`""" |
@@ -154,3 +222,6 @@ def load_singleton( |
154 | 222 | ) -> DuneRecord: |
155 | 223 | """Loads and returns single entry by index (default 0)""" |
156 | 224 | return self._load(name, self._parse_ftype(ftype))[index] |
| 225 | + |
| 226 | + |
# Type alias for write-shaped methods: (self, data, name, ftype) -> None.
# NOTE(review): assumes FileIO is the enclosing file-manager class defined
# above in this module — confirm against the class definition.
WriteLikeSignature = Callable[[FileIO, List[DuneRecord], str, FileType], None]
0 commit comments