Commit ff1dc76

Update method names and signatures

1 parent 55e4a71

File tree

4 files changed: +32 -31 lines


dbldatagen/data_generator.py

Lines changed: 7 additions & 6 deletions

@@ -1917,25 +1917,26 @@ def scriptMerge(
 
         return result
 
-    def buildOutputDataset(
-        self, output_dataset: OutputDataset,
+    def saveAsDataset(
+        self,
+        dataset: OutputDataset,
         with_streaming: bool | None = None,
         generator_options: dict[str, Any] | None = None
     ) -> StreamingQuery | None:
         """
-        Builds a `DataFrame` from the `DataGenerator` and writes the data to a target table.
+        Builds a `DataFrame` from the `DataGenerator` and writes the data to an output dataset (e.g. a table or files).
 
-        :param output_dataset: Output configuration for writing generated data
+        :param dataset: Output dataset for writing generated data
         :param with_streaming: Whether to generate data using streaming. If None, auto-detects based on trigger
         :param generator_options: Options for building the generator (e.g. `{"rowsPerSecond": 100}`)
         :returns: A Spark `StreamingQuery` if data is written in streaming, otherwise `None`
         """
         # Auto-detect streaming mode if not explicitly specified
         if with_streaming is None:
-            with_streaming = output_dataset.trigger is not None and len(output_dataset.trigger) > 0
+            with_streaming = dataset.trigger is not None and len(dataset.trigger) > 0
 
         df = self.build(withStreaming=with_streaming, options=generator_options)
-        return write_data_to_output(df, config=output_dataset)
+        return write_data_to_output(df, output_dataset=dataset)
 
     @staticmethod
     def loadFromJson(options: str) -> "DataGenerator":
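
For reference, the renamed call and its trigger-based auto-detection read as follows. This is a minimal sketch, assuming an active SparkSession `spark` and borrowing `OutputDataset("main.demo.users")` from the docs change below; the generator columns are illustrative.

    import dbldatagen as dg
    from dbldatagen.config import OutputDataset

    # Illustrative generator spec; any DataGenerator works here.
    testDataSpec = (
        dg.DataGenerator(spark, name="users", rows=1000)
        .withColumn("user_id", "long", uniqueValues=1000)
    )

    # No trigger on the dataset, so with_streaming auto-detects to False
    # and the batch path returns None rather than a StreamingQuery.
    outputDataset = OutputDataset("main.demo.users")
    result = testDataSpec.saveAsDataset(dataset=outputDataset)
    assert result is None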

dbldatagen/utils.py

Lines changed: 16 additions & 16 deletions

@@ -366,38 +366,38 @@ def system_time_millis() -> int:
     return curr_time
 
 
-def write_data_to_output(df: DataFrame, config: OutputDataset) -> StreamingQuery | None:
+def write_data_to_output(df: DataFrame, output_dataset: OutputDataset) -> StreamingQuery | None:
     """
     Writes a DataFrame to the sink configured in the output configuration.
 
     :param df: Spark DataFrame to write
-    :param config: Output configuration passed as an `OutputConfig`
+    :param output_dataset: Output dataset configuration passed as an `OutputDataset`
     :returns: A Spark `StreamingQuery` if data is written in streaming, otherwise `None`
     """
     if df.isStreaming:
-        if not config.trigger:
+        if not output_dataset.trigger:
             query = (
-                df.writeStream.format(config.format)
-                .outputMode(config.output_mode)
-                .options(**config.options)
-                .start(config.location)
+                df.writeStream.format(output_dataset.format)
+                .outputMode(output_dataset.output_mode)
+                .options(**output_dataset.options)
+                .start(output_dataset.location)
             )
         else:
             query = (
-                df.writeStream.format(config.format)
-                .outputMode(config.output_mode)
-                .options(**config.options)
-                .trigger(**config.trigger)
-                .start(config.location)
+                df.writeStream.format(output_dataset.format)
+                .outputMode(output_dataset.output_mode)
+                .options(**output_dataset.options)
+                .trigger(**output_dataset.trigger)
+                .start(output_dataset.location)
             )
         return query
 
     else:
         (
-            df.write.format(config.format)
-            .mode(config.output_mode)
-            .options(**config.options)
-            .save(config.location)
+            df.write.format(output_dataset.format)
+            .mode(output_dataset.output_mode)
+            .options(**output_dataset.options)
+            .save(output_dataset.location)
         )
 
     return None
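
A call-site sketch for the renamed keyword. The constructor arguments below mirror the fields `write_data_to_output` reads (`format`, `output_mode`, `options`, `location`, `trigger`) and the values used in the tests, but the exact `OutputDataset` constructor signature and the two DataFrames are assumptions.

    from dbldatagen.config import OutputDataset
    from dbldatagen.utils import write_data_to_output

    # batch_df / streaming_df: pre-built batch and streaming DataFrames (assumed).

    # Batch DataFrame: takes the df.write branch and returns None.
    batch_ds = OutputDataset("/tmp/dbldatagen/users", options={"mergeSchema": "true"})
    assert write_data_to_output(batch_df, output_dataset=batch_ds) is None

    # Streaming DataFrame with a trigger: takes the df.writeStream branch
    # and returns the running StreamingQuery.
    stream_ds = OutputDataset(
        "/tmp/dbldatagen/users_stream",
        trigger={"processingTime": "1 SECOND"},
    )
    query = write_data_to_output(streaming_df, output_dataset=stream_ds)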

docs/source/writing_generated_data.rst

Lines changed: 7 additions & 7 deletions

@@ -4,12 +4,12 @@
 Writing Generated Data to Tables or Files
 ===========================================================
 
-Generated data can be written directly to output tables or files using the ``OutputConfig`` class.
+Generated data can be written directly to output tables or files using the ``OutputDataset`` class.
 
 Writing generated data to a table
 ---------------------------------
 
-Once you've defined a ``DataGenerator``, call the ``buildOutputDataset`` method to write data to a target table.
+Once you've defined a ``DataGenerator``, call the ``saveAsDataset`` method to write data to a target table.
 
 .. code-block:: python
 
@@ -28,7 +28,7 @@ Once you've defined a ``DataGenerator``, call the ``buildOutputDataset`` method
     outputDataset = OutputDataset("main.demo.users")
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
 
 Writing generated data with streaming
 -------------------------------------
@@ -39,7 +39,7 @@ Python dictionaries (e.g. ``{"processingTime": "10 seconds"}`` to write data eve
 .. code-block:: python
 
     import dbldatagen as dg
-    from dbldatagen.config import OutputConfig
+    from dbldatagen.config import OutputDataset
 
     # Create a sample data generator with a few columns:
     testDataSpec = (
@@ -56,7 +56,7 @@ Python dictionaries (e.g. ``{"processingTime": "10 seconds"}`` to write data eve
     )
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
 
 Options for writing data
 ------------------------
@@ -85,7 +85,7 @@ Data will be written in append mode by default.
     )
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
 
 Writing generated data to files
 -------------------------------
@@ -115,4 +115,4 @@ in Databricks File System (DBFS).
     )
 
     # Generate and write the output data:
-    testDataSpec.buildOutputDataset(config=outputDataset)
+    testDataSpec.saveAsDataset(dataset=outputDataset)
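
Putting the documented pieces together, a streaming write through the renamed method might look like the sketch below. The trigger follows the `{"processingTime": "10 seconds"}` example in the docs; the output location, checkpoint path, and generator spec are illustrative.

    import dbldatagen as dg
    from dbldatagen.config import OutputDataset

    testDataSpec = (
        dg.DataGenerator(spark, name="users_stream", rows=100000)
        .withColumn("user_id", "long", uniqueValues=100000)
    )

    # A non-empty trigger makes saveAsDataset auto-detect streaming mode,
    # so the call returns a StreamingQuery instead of None.
    outputDataset = OutputDataset(
        "/tmp/dbldatagen/users_stream",
        options={"checkpointLocation": "/tmp/dbldatagen/users_stream/_checkpoint"},
        trigger={"processingTime": "10 seconds"},
    )
    query = testDataSpec.saveAsDataset(dataset=outputDataset)
    query.awaitTermination(60)  # let it run briefly, then stop
    query.stop()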

tests/test_output.py

Lines changed: 2 additions & 2 deletions

@@ -64,7 +64,7 @@ def test_build_output_data_batch(self, get_output_directories, seed_column_name,
             options={"mergeSchema": "true"},
         )
 
-        gen.buildOutputDataset(output_dataset)
+        gen.saveAsDataset(output_dataset)
         persisted_df = spark.read.format(table_format).load(table_dir)
         assert persisted_df.count() > 0
 
@@ -100,7 +100,7 @@ def test_build_output_data_streaming(self, get_output_directories, seed_column_n
             trigger={"processingTime": "1 SECOND"}
         )
 
-        query = gen.buildOutputDataset(output_dataset, with_streaming=True)
+        query = gen.saveAsDataset(output_dataset, with_streaming=True)
 
         start_time = time.time()
         elapsed_time = 0
