fix: Write all record batches to the same file without overwriting rows (#195)

edgarrmondragon · web-flow · commit 75b1df8d2a7d · 2025-05-02T15:17:02.000-06:00
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,7 @@ python = ">=3.8"
 pytz = "~=2025.1"
 singer-sdk = "~=0.42.1"
 
-[tool.poetry.dev-dependencies]
+[tool.poetry.group.dev.dependencies]
 pytest = "~=8.3"
 
 [tool.poetry.scripts]
@@ -45,8 +45,6 @@ target-version = "py38"
 
 [tool.ruff.lint]
 ignore = [
-    "ANN101", # Missing type annotation for `self` in method
-    "ANN102", # Missing type annotation for `cls` in class method
     "ANN401", # Allow `typing.Any` as parameter type
 ]
 select = [
diff --git a/target_csv/serialization.py b/target_csv/serialization.py
@@ -1,36 +1,20 @@
-import csv  # noqa: D100
-import sys
-from pathlib import Path
-from typing import Any, List, Callable, TypeVar
-
-if sys.version_info < (3, 10):
-    from typing_extensions import Concatenate, ParamSpec
-else:
-    from typing import Concatenate, ParamSpec
-
-P = ParamSpec("P")
-T = TypeVar("T")
-
+"""Serialization utilities for CSV files."""
 
-def create_folder_if_not_exists(
-    func: Callable[Concatenate[Path, P], T],
-) -> Callable[Concatenate[Path, P], T]:
-    """Decorator to create folder if it does not exist."""
+from __future__ import annotations
 
-    def wrapper(filepath: Path, *args: P.args, **kwargs: P.kwargs) -> T:
-        filepath.parent.mkdir(parents=True, exist_ok=True)
-        return func(filepath, *args, **kwargs)
-
-    return wrapper
+import csv  # noqa: D100
+import tempfile
+from pathlib import Path
+from typing import Any
 
 
-@create_folder_if_not_exists
-def write_csv(filepath: Path, records: List[dict], schema: dict, **kwargs: Any) -> int:
+def write_csv(
+    filepath: Path,
+    records: list[dict],
+    keys: list[str],
+    **kwargs: Any,
+) -> int:
     """Write a CSV file."""
-    if "properties" not in schema:
-        raise ValueError("Stream's schema has no properties defined.")
-
-    keys: List[str] = list(schema["properties"].keys())
     with open(filepath, "w", encoding="utf-8", newline="") as fp:
         writer = csv.DictWriter(fp, fieldnames=keys, dialect="excel", **kwargs)
         writer.writeheader()
@@ -40,9 +24,49 @@ def write_csv(filepath: Path, records: List[dict], schema: dict, **kwargs: Any)
     return record_count
 
 
-def read_csv(filepath: Path) -> List[dict]:
+def write_header(filepath: Path, keys: list[str], **kwargs: Any) -> None:
+    """Write a header to a CSV file.
+
+    Creates the parent directory if it doesn't exist.
+    """
+    filepath.parent.mkdir(parents=True, exist_ok=True)
+    with filepath.open("w", encoding="utf-8", newline="") as fp:
+        writer = csv.DictWriter(fp, fieldnames=keys, **kwargs)
+        writer.writeheader()
+
+
+def write_batch(
+    filepath: Path,
+    records: list[dict],
+    keys: list[str],
+    **kwargs: Any,
+) -> None:
+    """Append a batch of records to a CSV file.
+
+    The rows are first serialized to a temporary file, whose contents are then
+    appended to ``filepath`` in a single write. No header row is written here.
+
+    Args:
+        filepath: Path of the CSV file to append to.
+        records: Rows to write, as dictionaries keyed by column name.
+        keys: Column names, in output order.
+        **kwargs: Extra keyword arguments passed to ``csv.DictWriter``.
+    """
+    with tempfile.NamedTemporaryFile("w+", encoding="utf-8", newline="") as tmp_fp:
+        writer = csv.DictWriter(tmp_fp, fieldnames=keys, **kwargs)
+        writer.writerows(records)
+
+        tmp_fp.seek(0)
+
+        # Use the same encoding as the header and disable newline translation so
+        # the line terminators emitted by the csv writer are appended unchanged.
+        with filepath.open("a", encoding="utf-8", newline="") as f:
+            f.write(tmp_fp.read())
+
+
+def read_csv(filepath: Path) -> list[dict]:
     """Read a CSV file."""
-    result: List[dict] = []
+    result: list[dict] = []
     with open(filepath, newline="") as fp:
         reader = csv.DictReader(fp, delimiter=",", dialect="excel")
         result.extend(iter(reader))
diff --git a/target_csv/sinks.py b/target_csv/sinks.py
@@ -1,6 +1,9 @@
 """CSV target sink class, which handles writing streams."""
 
+from __future__ import annotations
+
 import datetime
+import functools
 import sys
 import warnings
 from pathlib import Path
@@ -10,7 +13,7 @@
 from singer_sdk import Target
 from singer_sdk.sinks import BatchSink
 
-from target_csv.serialization import write_csv
+from target_csv.serialization import write_batch, write_header
 
 
 class CSVSink(BatchSink):
@@ -77,32 +80,50 @@ def output_file(self) -> Path:  # noqa: D102
 
         return filepath
 
+    @functools.cached_property
+    def keys(self) -> list[str]:
+        """Get the header keys for the CSV file."""
+        if "properties" not in self.schema:
+            raise ValueError("Stream's schema has no properties defined")
+
+        return list(self.schema["properties"].keys())
+
+    @functools.cached_property
+    def escape_character(self) -> str | None:
+        """Get the escape character for the CSV file."""
+        return self.config.get("escape_character")
+
+    def setup(self) -> None:
+        """Create the output file and write the header."""
+        super().setup()
+        output_file = self.output_file
+        self.logger.info("Writing to destination file '%s'...", output_file.resolve())
+        write_header(
+            output_file,
+            self.keys,
+            dialect="excel",
+            escapechar=self.escape_character,
+        )
+
     def process_batch(self, context: dict) -> None:
         """Write out any prepped records and return once fully written."""
         output_file: Path = self.output_file
-        self.logger.info(f"Writing to destination file '{output_file.resolve()}'...")
-        new_contents: dict  # noqa: F842
-        create_new = (
-            self.config["overwrite_behavior"] == "replace_file"
-            or not output_file.exists()
-        )
-        if not create_new:
-            raise NotImplementedError("Append mode is not yet supported.")
 
         if not isinstance(context["records"], list):
-            self.logger.warning(f"No values in {self.stream_name} records collection.")
+            self.logger.warning("No values in %s records collection.", self.stream_name)
             context["records"] = []
 
         records: List[Dict[str, Any]] = context["records"]
         if "record_sort_property_name" in self.config:
             sort_property_name = self.config["record_sort_property_name"]
             records = sorted(records, key=lambda x: x[sort_property_name])
 
-        self.logger.info(f"Writing {len(context['records'])} records to file...")
+        self.logger.info("Appending %d records to file...", len(records))
 
-        write_csv(
+        write_batch(
             output_file,
-            context["records"],
-            self.schema,
-            escapechar=self.config.get("escape_character"),
+            records,
+            self.keys,
+            dialect="excel",
+            escapechar=self.escape_character,
         )
diff --git a/target_csv/target.py b/target_csv/target.py
@@ -107,7 +107,7 @@ class TargetCSV(Target):
         ),
         th.Property(
             "escape_character",
-            th.StringType,
+            th.StringType(min_length=1, max_length=1),
             description="The character to use for escaping special characters.",
         ),
     ).to_dict()
diff --git a/tests/test_csv.py b/tests/test_csv.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from target_csv.serialization import read_csv, write_csv
+from target_csv.serialization import read_csv, write_batch, write_header
 
 SAMPLE_DATASETS: List[Tuple[Dict, List[Dict[str, Any]]]] = [
     (
@@ -70,18 +70,24 @@ def test_file_paths(output_dir) -> List[Path]:
 
 def test_csv_write(output_filepath) -> None:
     for schema, records in SAMPLE_DATASETS:
-        write_csv(filepath=output_filepath, records=records, schema=schema)
+        keys = list(schema["properties"].keys())
+        write_header(filepath=output_filepath, keys=keys)
+        write_batch(filepath=output_filepath, records=records, keys=keys)
 
 
 def test_csv_write_if_not_exists(test_file_paths) -> None:
     for path in test_file_paths:
         for schema, records in SAMPLE_DATASETS:
-            write_csv(filepath=path, records=records, schema=schema)
+            keys = list(schema["properties"].keys())
+            write_header(filepath=path, keys=keys)
+            write_batch(filepath=path, records=records, keys=keys)
 
 
 def test_csv_roundtrip(output_filepath) -> None:
     for schema, records in SAMPLE_DATASETS:
-        write_csv(filepath=output_filepath, records=records, schema=schema)
+        keys = list(schema["properties"].keys())
+        write_header(filepath=output_filepath, keys=keys)
+        write_batch(filepath=output_filepath, records=records, keys=keys)
         read_records = read_csv(filepath=output_filepath)
         for orig_record, new_record in zip(records, read_records):
             for key in orig_record.keys():