 )
 
 
-def create_dataframe(df_type, data, column_name, spark):
-    """Helper function to create pandas or pyspark DataFrame."""
+def create_dataframe(df_type, data, column_name, spark, data_type="long"):
+    """Helper function to create pandas or pyspark DataFrame.
+
+    Args:
+        df_type: "pandas" or "pyspark"
+        data: List of values for the column
+        column_name: Name of the column
+        spark: Spark session (required for pyspark)
+        data_type: Data type for the column - "long", "string", "double", "boolean", "timestamp"
+    """
     if df_type == "pandas":
         return pd.DataFrame(data, columns=[column_name])
     else:  # pyspark
-        # Use explicit schema for all PySpark DataFrames (required for empty DataFrames)
-        from pyspark.sql.types import StructType, StructField, LongType
-
-        schema = StructType([StructField(column_name, LongType(), True)])
-        return spark.createDataFrame([(val,) for val in data], schema)
+        # Use explicit schema for all PySpark DataFrames
+        from pyspark.sql.types import (
+            StructType,
+            StructField,
+            LongType,
+            StringType,
+            DoubleType,
+            BooleanType,
+            TimestampType,
+        )
 
+        type_mapping = {
+            "long": LongType(),
+            "string": StringType(),
+            "double": DoubleType(),
+            "boolean": BooleanType(),
+            "timestamp": TimestampType(),
+        }
 
-def get_df_type_enum(df_type):
-    """Get DataFrameType enum value."""
-    return DataFrameType.PANDAS if df_type == "pandas" else DataFrameType.PYSPARK
+        schema = StructType([StructField(column_name, type_mapping[data_type], True)])
+        return spark.createDataFrame([(val,) for val in data], schema)
 
 
 def test_expectation_name():
@@ -48,23 +67,23 @@ def test_expectation_name():
 
 
 @pytest.mark.parametrize(
-    "df_type, df_data, min_value, max_value, expected_result, expected_message",
+    "df_type, df_data, min_value, max_value, expected_result, expected_message, data_type",
     [
         # Basic success - 3 distinct values within range [2, 5]
-        ("pandas", [1, 2, 3, 2, 1], 2, 5, "success", None),
-        ("pyspark", [1, 2, 3, 2, 1], 2, 5, "success", None),
+        ("pandas", [1, 2, 3, 2, 1], 2, 5, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 2, 1], 2, 5, "success", None, "long"),
         # Success with nulls - 3 distinct values [1, 2, None] within range [3, 4]
-        ("pandas", [1, 2, None, 2, 1], 3, 4, "success", None),
-        ("pyspark", [1, 2, None, 2, 1], 3, 4, "success", None),
+        ("pandas", [1, 2, None, 2, 1], 3, 4, "success", None, "long"),
+        ("pyspark", [1, 2, None, 2, 1], 3, 4, "success", None, "long"),
         # Exact minimum boundary - 3 distinct values at min boundary [3, 5]
-        ("pandas", [1, 2, 3, 2, 1], 3, 5, "success", None),
-        ("pyspark", [1, 2, 3, 2, 1], 3, 5, "success", None),
+        ("pandas", [1, 2, 3, 2, 1], 3, 5, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 2, 1], 3, 5, "success", None, "long"),
         # Exact maximum boundary - 5 distinct values at max boundary [3, 5]
-        ("pandas", [1, 2, 3, 4, 5, 1], 3, 5, "success", None),
-        ("pyspark", [1, 2, 3, 4, 5, 1], 3, 5, "success", None),
+        ("pandas", [1, 2, 3, 4, 5, 1], 3, 5, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 4, 5, 1], 3, 5, "success", None, "long"),
        # Edge case: zero range (min == max) - success with exact match
-        ("pandas", [1, 2, 3, 2, 1], 3, 3, "success", None),
-        ("pyspark", [1, 2, 3, 2, 1], 3, 3, "success", None),
+        ("pandas", [1, 2, 3, 2, 1], 3, 3, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 2, 1], 3, 3, "success", None, "long"),
         # Edge case: zero range (min == max) - failure when not exact
         (
             "pandas",
@@ -73,6 +92,7 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 3 and 3.",
+            "long",
         ),
         (
             "pyspark",
@@ -81,13 +101,14 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 3 and 3.",
+            "long",
         ),
         # Edge case: empty DataFrame - 0 distinct values within range [0, 5]
-        ("pandas", [], 0, 5, "success", None),
-        ("pyspark", [], 0, 5, "success", None),
+        ("pandas", [], 0, 5, "success", None, "long"),
+        ("pyspark", [], 0, 5, "success", None, "long"),
         # Edge case: single distinct value - 1 distinct value within range [1, 1]
-        ("pandas", [1, 1, 1, 1, 1], 1, 1, "success", None),
-        ("pyspark", [1, 1, 1, 1, 1], 1, 1, "success", None),
+        ("pandas", [1, 1, 1, 1, 1], 1, 1, "success", None, "long"),
+        ("pyspark", [1, 1, 1, 1, 1], 1, 1, "success", None, "long"),
         # Too few distinct values - 2 distinct, expecting [4, 6]
         (
             "pandas",
@@ -96,6 +117,7 @@ def test_expectation_name():
             6,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 4 and 6.",
+            "long",
         ),
         (
             "pyspark",
@@ -104,6 +126,7 @@ def test_expectation_name():
             6,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 4 and 6.",
+            "long",
         ),
         # Too many distinct values - 5 distinct, expecting [2, 3]
         (
@@ -113,6 +136,7 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 5 distinct values, expected between 2 and 3.",
+            "long",
         ),
         (
             "pyspark",
@@ -121,7 +145,52 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 5 distinct values, expected between 2 and 3.",
+            "long",
+        ),
+        # Data type validation scenarios (consolidated from test_data_type_validation)
+        # String column with mixed values including None
+        ("pandas", ["A", "B", "C", "B", "A", None], 3, 5, "success", None, "string"),
+        ("pyspark", ["A", "B", "C", "B", "A", None], 3, 5, "success", None, "string"),
+        # Float column
+        ("pandas", [1.1, 2.2, 3.3, 2.2, 1.1], 2, 4, "success", None, "double"),
+        ("pyspark", [1.1, 2.2, 3.3, 2.2, 1.1], 2, 4, "success", None, "double"),
+        # Boolean column
+        ("pandas", [True, False, True, False, True], 2, 2, "success", None, "boolean"),
+        ("pyspark", [True, False, True, False, True], 2, 2, "success", None, "boolean"),
+        # Datetime column - pandas with pd.to_datetime
+        (
+            "pandas",
+            pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-02", "2023-01-01"]),
+            2,
+            4,
+            "success",
+            None,
+            "timestamp",
+        ),
+        # Datetime column - pyspark with datetime objects
+        (
+            "pyspark",
+            pd.to_datetime(
+                ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-02", "2023-01-01"]
+            ).to_pydatetime(),
+            2,
+            4,
+            "success",
+            None,
+            "timestamp",
         ),
+        # Negative integers
+        ("pandas", [-10, -20, -30, -20, -10], 2, 4, "success", None, "long"),
+        ("pyspark", [-10, -20, -30, -20, -10], 2, 4, "success", None, "long"),
+        # Mixed positive and negative integers
+        ("pandas", [-1, 0, 1, 0, -1], 2, 4, "success", None, "long"),
+        ("pyspark", [-1, 0, 1, 0, -1], 2, 4, "success", None, "long"),
+        # Large integers
+        ("pandas", [1000000, 2000000, 3000000, 2000000, 1000000], 2, 4, "success", None, "long"),
+        ("pyspark", [1000000, 2000000, 3000000, 2000000, 1000000], 2, 4, "success", None, "long"),
+        # All null values
+        ("pandas", [None, None, None, None], 0, 1, "success", None, "long"),
+        ("pyspark", [None, None, None, None], 0, 1, "success", None, "long"),
     ],
     ids=[
         "pandas_success",
@@ -144,18 +213,35 @@ def test_expectation_name():
         "pyspark_too_few",
         "pandas_too_many",
         "pyspark_too_many",
+        "pandas_string_with_nulls",
+        "pyspark_string_with_nulls",
+        "pandas_float",
+        "pyspark_float",
+        "pandas_boolean",
+        "pyspark_boolean",
+        "pandas_datetime",
+        "pyspark_datetime",
+        "pandas_negative_integers",
+        "pyspark_negative_integers",
+        "pandas_mixed_positive_negative",
+        "pyspark_mixed_positive_negative",
+        "pandas_large_integers",
+        "pyspark_large_integers",
+        "pandas_all_nulls",
+        "pyspark_all_nulls",
     ],
 )
 def test_expectation_basic_scenarios(
-    df_type, df_data, min_value, max_value, expected_result, expected_message, spark
+    df_type, df_data, min_value, max_value, expected_result, expected_message, data_type, spark
 ):
     """
     Test the expectation for various scenarios across pandas and PySpark DataFrames.
     Tests both direct expectation validation and suite-based validation.
     Covers: success cases, success with nulls, exact boundaries, edge cases (zero range, empty, single value),
-    too few values, and too many values.
+    too few values, too many values, and various data types (string, float, boolean, datetime, negative integers,
+    mixed positive/negative integers, large integers, all nulls).
     """
-    data_frame = create_dataframe(df_type, df_data, "col1", spark)
+    data_frame = create_dataframe(df_type, df_data, "col1", spark, data_type)
 
     # Test 1: Direct expectation validation
     expectation = DataFrameExpectationRegistry.get_expectation(
@@ -176,7 +262,7 @@ def test_expectation_basic_scenarios(
     else:  # failure
         expected_failure_message = DataFrameExpectationFailureMessage(
             expectation_str=str(expectation),
-            data_frame_type=get_df_type_enum(df_type),
+            data_frame_type=str(df_type),
             message=expected_message,
         )
         assert str(result) == str(expected_failure_message), (
@@ -264,46 +350,6 @@ def test_invalid_parameters(min_value, max_value, expected_error_message):
     )
 
 
-@pytest.mark.parametrize(
-    "df_data, min_value, max_value, expected_distinct, description",
-    [
-        # String column with mixed values including None
-        (["A", "B", "C", "B", "A", None], 3, 5, 4, "string_with_nulls"),
-        # Float column
-        ([1.1, 2.2, 3.3, 2.2, 1.1], 2, 4, 3, "float"),
-        # Boolean column
-        ([True, False, True, False, True], 2, 2, 2, "boolean"),
-        # Datetime column
-        (
-            pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-02", "2023-01-01"]),
-            2,
-            4,
-            3,
-            "datetime",
-        ),
-    ],
-    ids=["string_with_nulls", "float", "boolean", "datetime"],
-)
-def test_data_type_validation(df_data, min_value, max_value, expected_distinct, description):
-    """
-    Test the expectation with various data types.
-    Verifies that distinct value counting works correctly across different column types.
-    """
-    expectation = DataFrameExpectationRegistry.get_expectation(
-        expectation_name="ExpectationDistinctColumnValuesBetween",
-        column_name="col1",
-        min_value=min_value,
-        max_value=max_value,
-    )
-
-    data_frame = pd.DataFrame({"col1": df_data})
-    result = expectation.validate(data_frame=data_frame)
-
-    assert isinstance(result, DataFrameExpectationSuccessMessage), (
-        f"Expected DataFrameExpectationSuccessMessage for {description} data type but got: {type(result)}"
-    )
-
-
 def test_large_dataset_performance():
     """
     Test the expectation with a larger dataset to ensure reasonable performance.