Commit 55e4a71

Refactor

1 parent a426ed3 commit 55e4a71

File tree

4 files changed: +56 -26 lines changed

  dbldatagen/config.py
  dbldatagen/data_generator.py
  dbldatagen/utils.py
  tests/test_output.py

dbldatagen/config.py

Lines changed: 12 additions & 3 deletions
@@ -5,7 +5,7 @@
 """
 This module implements configuration classes for writing generated data.
 """
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 
 
 @dataclass(frozen=True, slots=True)
@@ -23,5 +23,14 @@ class OutputDataset:
     location: str
     output_mode: str = "append"
     format: str = "delta"
-    options: dict[str, str] = field(default_factory=dict)
-    trigger: dict[str, bool | str] = field(default_factory=dict)
+    options: dict[str, str] | None = None
+    trigger: dict[str, str] | None = None
+
+    def __post_init__(self) -> None:
+        if not self.trigger:
+            return
+
+        # Only processingTime is currently supported
+        if "processingTime" not in self.trigger:
+            valid_trigger_format = '{"processingTime": "10 SECONDS"}'
+            raise ValueError(f"Attribute 'trigger' must be a dictionary of the form '{valid_trigger_format}'")
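
For context, a minimal sketch of how the reworked trigger validation behaves; the field values below are illustrative placeholders, not taken from the commit:

from dbldatagen.config import OutputDataset

# Accepted: "processingTime" is the only trigger key the __post_init__ check allows
ds = OutputDataset(location="/some/output/path", trigger={"processingTime": "10 SECONDS"})

# Rejected: any other trigger key now raises ValueError at construction time
try:
    OutputDataset(location="/some/output/path", trigger={"availableNow": "true"})
except ValueError as err:
    print(err)

# Omitting trigger (or passing None) skips the check entirely, leaving a batch-style config
batch_ds = OutputDataset(location="/some/output/path", format="parquet")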

dbldatagen/data_generator.py

Lines changed: 12 additions & 4 deletions
@@ -15,6 +15,7 @@
 from typing import Any
 
 from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.streaming.query import StreamingQuery
 from pyspark.sql.types import DataType, IntegerType, LongType, StringType, StructField, StructType
 
 from dbldatagen import datagen_constants
@@ -1917,17 +1918,24 @@ def scriptMerge(
         return result
 
     def buildOutputDataset(
-        self, output_dataset: OutputDataset, generator_options: dict[str, Any] | None = None
-    ) -> None:
+        self, output_dataset: OutputDataset,
+        with_streaming: bool | None = None,
+        generator_options: dict[str, Any] | None = None
+    ) -> StreamingQuery | None:
         """
         Builds a `DataFrame` from the `DataGenerator` and writes the data to a target table.
 
         :param output_dataset: Output configuration for writing generated data
+        :param with_streaming: Whether to generate data using streaming. If None, auto-detects based on trigger
         :param generator_options: Options for building the generator (e.g. `{"rowsPerSecond": 100}`)
+        :returns: A Spark `StreamingQuery` if data is written in streaming, otherwise `None`
         """
-        with_streaming = output_dataset.trigger is not None
+        # Auto-detect streaming mode if not explicitly specified
+        if with_streaming is None:
+            with_streaming = output_dataset.trigger is not None and len(output_dataset.trigger) > 0
+
         df = self.build(withStreaming=with_streaming, options=generator_options)
-        write_data_to_output(df, config=output_dataset)
+        return write_data_to_output(df, config=output_dataset)
 
     @staticmethod
     def loadFromJson(options: str) -> "DataGenerator":
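
A hedged usage sketch of the new signature, assuming gen is an already-configured DataGenerator and the paths are placeholders:

# Batch write: no trigger on the config, so streaming is auto-detected as False and None is returned
batch_config = OutputDataset(location="/some/batch/path", format="parquet", options={"mergeSchema": "true"})
result = gen.buildOutputDataset(batch_config)  # result is None

# Streaming write: a processingTime trigger plus a checkpoint location, with streaming forced on
stream_config = OutputDataset(
    location="/some/stream/path",
    options={"checkpointLocation": "/some/checkpoint/path"},
    trigger={"processingTime": "5 SECONDS"},
)
query = gen.buildOutputDataset(stream_config, with_streaming=True)
query.stop()  # the caller now owns the StreamingQuery lifecycle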

dbldatagen/utils.py

Lines changed: 7 additions & 2 deletions
@@ -19,6 +19,7 @@
 
 import jmespath
 from pyspark.sql import DataFrame
+from pyspark.sql.streaming.query import StreamingQuery
 
 from dbldatagen.config import OutputDataset
 
@@ -365,12 +366,13 @@ def system_time_millis() -> int:
     return curr_time
 
 
-def write_data_to_output(df: DataFrame, config: OutputDataset) -> None:
+def write_data_to_output(df: DataFrame, config: OutputDataset) -> StreamingQuery | None:
     """
     Writes a DataFrame to the sink configured in the output configuration.
 
     :param df: Spark DataFrame to write
     :param config: Output configuration passed as an `OutputConfig`
+    :returns: A Spark `StreamingQuery` if data is written in streaming, otherwise `None`
     """
     if df.isStreaming:
         if not config.trigger:
@@ -388,11 +390,14 @@ def write_data_to_output(df: DataFrame, config: OutputDataset) -> None:
            .trigger(**config.trigger)
            .start(config.location)
        )
-        query.awaitTermination()
+        return query
+
     else:
         (
             df.write.format(config.format)
             .mode(config.output_mode)
             .options(**config.options)
             .save(config.location)
         )
+
+    return None
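
The behavioral change worth calling out: write_data_to_output no longer calls query.awaitTermination(), so streaming writes return immediately and the caller decides when to wait or stop. A minimal sketch of the resulting calling pattern; the DataFrame and config names here are placeholders:

query = write_data_to_output(streaming_df, config=stream_config)
if query is not None:
    # Either block until the query finishes (optionally with a timeout)...
    # query.awaitTermination(timeout=30)
    # ...or let it run for a while and stop it explicitly, as the updated test does.
    query.stop()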

tests/test_output.py

Lines changed: 25 additions & 17 deletions
@@ -1,5 +1,6 @@
 import os
 import shutil
+import time
 import uuid
 import pytest
 
@@ -27,14 +28,15 @@ def get_output_directories(self):
         shutil.rmtree(base_dir, ignore_errors=True)
         print(f"\n\n*** test dir [{base_dir}] deleted")
 
-    @pytest.mark.parametrize("seed_column_name, table_format, table_location", [
-        ("id", "delta", "/table_folder"),
-        ("_id", "json", "/json_data_folder"),
-        ("id", "csv", "/csv_data_folder"),
-    ])
-    def test_build_output_data_batch(self, get_output_directories, seed_column_name, table_format, table_location):
+    @pytest.mark.parametrize("trigger", [{"availableNow": True}, {"once": True}, {"invalid": "yes"}])
+    def test_initialize_output_dataset_invalid_trigger(self, trigger):
+        with pytest.raises(ValueError, match=f"Attribute 'trigger' must be a dictionary of the form"):
+            _ = dg.OutputDataset(location="/location", trigger=trigger)
+
+    @pytest.mark.parametrize("seed_column_name, table_format", [("id", "parquet"), ("_id", "json"), ("id", "csv")])
+    def test_build_output_data_batch(self, get_output_directories, seed_column_name, table_format):
         base_dir, data_dir, checkpoint_dir = get_output_directories
-        table_dir = f"{data_dir}/{table_location}"
+        table_dir = f"{data_dir}/{uuid.uuid4()}"
 
         gen = dg.DataGenerator(
             sparkSession=spark,
@@ -59,21 +61,17 @@ def test_build_output_data_batch(self, get_output_directories, seed_column_name,
             location=table_dir,
             output_mode="append",
             format=table_format,
-            options={"mergeSchema": "true", "checkpointLocation": f"{data_dir}/{checkpoint_dir}"},
+            options={"mergeSchema": "true"},
         )
 
         gen.buildOutputDataset(output_dataset)
         persisted_df = spark.read.format(table_format).load(table_dir)
         assert persisted_df.count() > 0
 
-    @pytest.mark.parametrize("seed_column_name, table_format, table_location", [
-        ("id", "delta", "/table_folder"),
-        ("_id", "json", "/json_data_folder"),
-        ("id", "csv", "/csv_data_folder"),
-    ])
-    def test_build_output_data_streaming(self, get_output_directories, seed_column_name, table_format, table_location):
+    @pytest.mark.parametrize("seed_column_name, table_format", [("id", "parquet"), ("_id", "json"), ("id", "csv")])
+    def test_build_output_data_streaming(self, get_output_directories, seed_column_name, table_format):
         base_dir, data_dir, checkpoint_dir = get_output_directories
-        table_dir = f"{data_dir}/{table_location}"
+        table_dir = f"{data_dir}/{uuid.uuid4()}"
 
         gen = dg.DataGenerator(
             sparkSession=spark,
@@ -99,9 +97,19 @@ def test_build_output_data_streaming(self, get_output_directories, seed_column_n
             output_mode="append",
             format=table_format,
             options={"mergeSchema": "true", "checkpointLocation": f"{data_dir}/{checkpoint_dir}"},
-            trigger={"availableNow": True}
+            trigger={"processingTime": "1 SECOND"}
         )
 
-        gen.buildOutputDataset(output_dataset)
+        query = gen.buildOutputDataset(output_dataset, with_streaming=True)
+
+        start_time = time.time()
+        elapsed_time = 0
+        time_limit = 10.0
+
+        while elapsed_time < time_limit:
+            time.sleep(1)
+            elapsed_time = time.time() - start_time
+
+        query.stop()
         persisted_df = spark.read.format(table_format).load(table_dir)
         assert persisted_df.count() > 0
