Commit ff1dc76

Update method names and signatures

1 parent 55e4a71

File tree

4 files changed: +32 -31 lines


dbldatagen/data_generator.py

Lines changed: 7 additions & 6 deletions

@@ -1917,25 +1917,26 @@ def scriptMerge(
 
         return result
 
-    def buildOutputDataset(
-        self, output_dataset: OutputDataset,
+    def saveAsDataset(
+        self,
+        dataset: OutputDataset,
         with_streaming: bool | None = None,
         generator_options: dict[str, Any] | None = None
     ) -> StreamingQuery | None:
         """
-        Builds a `DataFrame` from the `DataGenerator` and writes the data to a target table.
+        Builds a `DataFrame` from the `DataGenerator` and writes the data to an output dataset (e.g. a table or files).
 
-        :param output_dataset: Output configuration for writing generated data
+        :param dataset: Output dataset for writing generated data
         :param with_streaming: Whether to generate data using streaming. If None, auto-detects based on trigger
         :param generator_options: Options for building the generator (e.g. `{"rowsPerSecond": 100}`)
         :returns: A Spark `StreamingQuery` if data is written in streaming, otherwise `None`
         """
         # Auto-detect streaming mode if not explicitly specified
         if with_streaming is None:
-            with_streaming = output_dataset.trigger is not None and len(output_dataset.trigger) > 0
+            with_streaming = dataset.trigger is not None and len(dataset.trigger) > 0
 
         df = self.build(withStreaming=with_streaming, options=generator_options)
-        return write_data_to_output(df, config=output_dataset)
+        return write_data_to_output(df, output_dataset=dataset)
 
     @staticmethod
     def loadFromJson(options: str) -> "DataGenerator":
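
For reference, the renamed call and its trigger-based auto-detection read as follows. This is a minimal sketch, assuming an active SparkSession `spark` and borrowing `OutputDataset("main.demo.users")` from the docs change below; the generator columns are illustrative.

    import dbldatagen as dg
    from dbldatagen.config import OutputDataset

    # Illustrative generator spec; any DataGenerator works here.
    testDataSpec = (
        dg.DataGenerator(spark, name="users", rows=1000)
        .withColumn("user_id", "long", uniqueValues=1000)
    )

    # No trigger on the dataset, so with_streaming auto-detects to False
    # and the batch path returns None rather than a StreamingQuery.
    outputDataset = OutputDataset("main.demo.users")
    result = testDataSpec.saveAsDataset(dataset=outputDataset)
    assert result is None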

dbldatagen/utils.py

Lines changed: 16 additions & 16 deletions

@@ -366,38 +366,38 @@ def system_time_millis() -> int:
     return curr_time
 
 
-def write_data_to_output(df: DataFrame, config: OutputDataset) -> StreamingQuery | None:
+def write_data_to_output(df: DataFrame, output_dataset: OutputDataset) -> StreamingQuery | None:
     """
     Writes a DataFrame to the sink configured in the output configuration.
 
     :param df: Spark DataFrame to write
-    :param config: Output configuration passed as an `OutputConfig`
+    :param output_dataset: Output dataset configuration passed as an `OutputDataset`
     :returns: A Spark `StreamingQuery` if data is written in streaming, otherwise `None`
     """
     if df.isStreaming:
-        if not config.trigger:
+        if not output_dataset.trigger:
             query = (
-                df.writeStream.format(config.format)
-                .outputMode(config.output_mode)
-                .options(**config.options)
-                .start(config.location)
+                df.writeStream.format(output_dataset.format)
+                .outputMode(output_dataset.output_mode)
+                .options(**output_dataset.options)
+                .start(output_dataset.location)
             )
         else:
             query = (
-                df.writeStream.format(config.format)
-                .outputMode(config.output_mode)
-                .options(**config.options)
-                .trigger(**config.trigger)
-                .start(config.location)
+                df.writeStream.format(output_dataset.format)
+                .outputMode(output_dataset.output_mode)
+                .options(**output_dataset.options)
+                .trigger(**output_dataset.trigger)
+                .start(output_dataset.location)
             )
         return query
 
     else:
         (
-            df.write.format(config.format)
-            .mode(config.output_mode)
-            .options(**config.options)
-            .save(config.location)
+            df.write.format(output_dataset.format)
+            .mode(output_dataset.output_mode)
+            .options(**output_dataset.options)
+            .save(output_dataset.location)
         )
 
     return None
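
A call-site sketch for the renamed keyword. The constructor arguments below mirror the fields `write_data_to_output` reads (`format`, `output_mode`, `options`, `location`, `trigger`) and the values used in the tests, but the exact `OutputDataset` constructor signature and the two DataFrames are assumptions.

    from dbldatagen.config import OutputDataset
    from dbldatagen.utils import write_data_to_output

    # batch_df / streaming_df: pre-built batch and streaming DataFrames (assumed).

    # Batch DataFrame: takes the df.write branch and returns None.
    batch_ds = OutputDataset("/tmp/dbldatagen/users", options={"mergeSchema": "true"})
    assert write_data_to_output(batch_df, output_dataset=batch_ds) is None

    # Streaming DataFrame with a trigger: takes the df.writeStream branch
    # and returns the running StreamingQuery.
    stream_ds = OutputDataset(
        "/tmp/dbldatagen/users_stream",
        trigger={"processingTime": "1 SECOND"},
    )
    query = write_data_to_output(streaming_df, output_dataset=stream_ds)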

docs/source/writing_generated_data.rst

Lines changed: 7 additions & 7 deletions

@@ -4,12 +4,12 @@
 Writing Generated Data to Tables or Files
 ===========================================================
 
-Generated data can be written directly to output tables or files using the ``OutputConfig`` class.
+Generated data can be written directly to output tables or files using the ``OutputDataset`` class.
 
 Writing generated data to a table
 ---------------------------------
 
-Once you've defined a ``DataGenerator``, call the ``buildOutputDataset`` method to write data to a target table.
+Once you've defined a ``DataGenerator``, call the ``saveAsDataset`` method to write data to a target table.
 
 .. code-block:: python
 
@@ -28,7 +28,7 @@ Once you've defined a ``DataGenerator``, call the ``buildOutputDataset`` method
     outputDataset = OutputDataset("main.demo.users")
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
 
 Writing generated data with streaming
 -------------------------------------
@@ -39,7 +39,7 @@ Python dictionaries (e.g. ``{"processingTime": "10 seconds"}`` to write data eve
 .. code-block:: python
 
     import dbldatagen as dg
-    from dbldatagen.config import OutputConfig
+    from dbldatagen.config import OutputDataset
 
     # Create a sample data generator with a few columns:
     testDataSpec = (
@@ -56,7 +56,7 @@ Python dictionaries (e.g. ``{"processingTime": "10 seconds"}`` to write data eve
     )
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
 
 Options for writing data
 ------------------------
@@ -85,7 +85,7 @@ Data will be written in append mode by default.
     )
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
 
 Writing generated data to files
 -------------------------------
@@ -115,4 +115,4 @@ in Databricks File System (DBFS).
     )
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
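
Putting the documented pieces together, a streaming write through the renamed method might look like the sketch below. The trigger follows the `{"processingTime": "10 seconds"}` example in the docs; the output location, checkpoint path, and generator spec are illustrative.

    import dbldatagen as dg
    from dbldatagen.config import OutputDataset

    testDataSpec = (
        dg.DataGenerator(spark, name="users_stream", rows=100000)
        .withColumn("user_id", "long", uniqueValues=100000)
    )

    # A non-empty trigger makes saveAsDataset auto-detect streaming mode,
    # so the call returns a StreamingQuery instead of None.
    outputDataset = OutputDataset(
        "/tmp/dbldatagen/users_stream",
        options={"checkpointLocation": "/tmp/dbldatagen/users_stream/_checkpoint"},
        trigger={"processingTime": "10 seconds"},
    )
    query = testDataSpec.saveAsDataset(dataset=outputDataset)
    query.awaitTermination(60)  # let it run briefly, then stop
    query.stop()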

tests/test_output.py

Lines changed: 2 additions & 2 deletions

@@ -64,7 +64,7 @@ def test_build_output_data_batch(self, get_output_directories, seed_column_name,
             options={"mergeSchema": "true"},
         )
 
-        gen.buildOutputDataset(output_dataset)
+        gen.saveAsDataset(output_dataset)
         persisted_df = spark.read.format(table_format).load(table_dir)
         assert persisted_df.count() > 0
 
@@ -100,7 +100,7 @@ def test_build_output_data_streaming(self, get_output_directories, seed_column_n
             trigger={"processingTime": "1 SECOND"}
         )
 
-        query = gen.buildOutputDataset(output_dataset, with_streaming=True)
+        query = gen.saveAsDataset(output_dataset, with_streaming=True)
 
         start_time = time.time()
         elapsed_time = 0
