 import os
 import shutil
+import time
 import uuid
 import pytest
 
@@ -27,14 +28,15 @@ def get_output_directories(self):
         shutil.rmtree(base_dir, ignore_errors=True)
         print(f"\n\n*** test dir [{base_dir}] deleted")
 
-    @pytest.mark.parametrize("seed_column_name, table_format, table_location", [
-        ("id", "delta", "/table_folder"),
-        ("_id", "json", "/json_data_folder"),
-        ("id", "csv", "/csv_data_folder"),
-    ])
-    def test_build_output_data_batch(self, get_output_directories, seed_column_name, table_format, table_location):
+    @pytest.mark.parametrize("trigger", [{"availableNow": True}, {"once": True}, {"invalid": "yes"}])
+    def test_initialize_output_dataset_invalid_trigger(self, trigger):
+        with pytest.raises(ValueError, match=f"Attribute 'trigger' must be a dictionary of the form"):
+            _ = dg.OutputDataset(location="/location", trigger=trigger)
+
+    @pytest.mark.parametrize("seed_column_name, table_format", [("id", "parquet"), ("_id", "json"), ("id", "csv")])
+    def test_build_output_data_batch(self, get_output_directories, seed_column_name, table_format):
         base_dir, data_dir, checkpoint_dir = get_output_directories
-        table_dir = f"{data_dir}/{table_location}"
+        table_dir = f"{data_dir}/{uuid.uuid4()}"
 
         gen = dg.DataGenerator(
             sparkSession=spark,
@@ -59,21 +61,17 @@ def test_build_output_data_batch(self, get_output_directories, seed_column_name,
             location=table_dir,
             output_mode="append",
             format=table_format,
-            options={"mergeSchema": "true", "checkpointLocation": f"{data_dir}/{checkpoint_dir}"},
+            options={"mergeSchema": "true"},
         )
 
         gen.buildOutputDataset(output_dataset)
         persisted_df = spark.read.format(table_format).load(table_dir)
         assert persisted_df.count() > 0
 
-    @pytest.mark.parametrize("seed_column_name, table_format, table_location", [
-        ("id", "delta", "/table_folder"),
-        ("_id", "json", "/json_data_folder"),
-        ("id", "csv", "/csv_data_folder"),
-    ])
-    def test_build_output_data_streaming(self, get_output_directories, seed_column_name, table_format, table_location):
+    @pytest.mark.parametrize("seed_column_name, table_format", [("id", "parquet"), ("_id", "json"), ("id", "csv")])
+    def test_build_output_data_streaming(self, get_output_directories, seed_column_name, table_format):
         base_dir, data_dir, checkpoint_dir = get_output_directories
-        table_dir = f"{data_dir}/{table_location}"
+        table_dir = f"{data_dir}/{uuid.uuid4()}"
 
         gen = dg.DataGenerator(
             sparkSession=spark,
@@ -99,9 +97,19 @@ def test_build_output_data_streaming(self, get_output_directories, seed_column_n
             output_mode="append",
             format=table_format,
             options={"mergeSchema": "true", "checkpointLocation": f"{data_dir}/{checkpoint_dir}"},
-            trigger={"availableNow": True}
+            trigger={"processingTime": "1 SECOND"}
         )
 
-        gen.buildOutputDataset(output_dataset)
+        query = gen.buildOutputDataset(output_dataset, with_streaming=True)
+
+        start_time = time.time()
+        elapsed_time = 0
+        time_limit = 10.0
+
+        while elapsed_time < time_limit:
+            time.sleep(1)
+            elapsed_time = time.time() - start_time
+
+        query.stop()
         persisted_df = spark.read.format(table_format).load(table_dir)
         assert persisted_df.count() > 0
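
For context, here is a minimal sketch (not part of the diff above) of the streaming write path these tests exercise. It relies only on the `OutputDataset` and `buildOutputDataset(..., with_streaming=True)` signatures shown in the diff; the generator columns, the temp paths, and the use of PySpark's `StreamingQuery.awaitTermination` in place of the test's sleep loop are illustrative assumptions, not part of the change.

```python
# Minimal sketch, not part of the committed tests: drive the streaming output
# path shown in the diff and bound the run with PySpark's awaitTermination.
import tempfile

import dbldatagen as dg  # assumes the same `dg` alias used by the tests
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
data_dir = tempfile.mkdtemp()

# Hypothetical generator spec; the real tests parametrize seed_column_name etc.
gen = (dg.DataGenerator(sparkSession=spark, name="streaming_example", rows=1000, partitions=2)
       .withColumn("value", "int", minValue=0, maxValue=100))

output_dataset = dg.OutputDataset(
    location=f"{data_dir}/output",
    output_mode="append",
    format="parquet",
    options={"checkpointLocation": f"{data_dir}/checkpoint"},
    trigger={"availableNow": True},  # a trigger form the validation test accepts
)

# Per the diff, with_streaming=True returns the streaming query, so the caller
# can wait on it directly instead of polling with time.sleep().
query = gen.buildOutputDataset(output_dataset, with_streaming=True)
query.awaitTermination(10)  # PySpark StreamingQuery; timeout in seconds
query.stop()

persisted_df = spark.read.format("parquet").load(f"{data_dir}/output")
assert persisted_df.count() > 0
```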