Commit 55933e8

test: standardised unit tests
1 parent 8dc2491 commit 55933e8

31 files changed (+3699 / -2075 lines)

tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_between.py

Lines changed: 115 additions & 69 deletions
@@ -15,21 +15,40 @@
 )
 
 
-def create_dataframe(df_type, data, column_name, spark):
-    """Helper function to create pandas or pyspark DataFrame."""
+def create_dataframe(df_type, data, column_name, spark, data_type="long"):
+    """Helper function to create pandas or pyspark DataFrame.
+
+    Args:
+        df_type: "pandas" or "pyspark"
+        data: List of values for the column
+        column_name: Name of the column
+        spark: Spark session (required for pyspark)
+        data_type: Data type for the column - "long", "string", "double", "boolean", "timestamp"
+    """
     if df_type == "pandas":
         return pd.DataFrame(data, columns=[column_name])
     else:  # pyspark
-        # Use explicit schema for all PySpark DataFrames (required for empty DataFrames)
-        from pyspark.sql.types import StructType, StructField, LongType
-
-        schema = StructType([StructField(column_name, LongType(), True)])
-        return spark.createDataFrame([(val,) for val in data], schema)
+        # Use explicit schema for all PySpark DataFrames
+        from pyspark.sql.types import (
+            StructType,
+            StructField,
+            LongType,
+            StringType,
+            DoubleType,
+            BooleanType,
+            TimestampType,
+        )
 
+        type_mapping = {
+            "long": LongType(),
+            "string": StringType(),
+            "double": DoubleType(),
+            "boolean": BooleanType(),
+            "timestamp": TimestampType(),
+        }
 
-def get_df_type_enum(df_type):
-    """Get DataFrameType enum value."""
-    return DataFrameType.PANDAS if df_type == "pandas" else DataFrameType.PYSPARK
+        schema = StructType([StructField(column_name, type_mapping[data_type], True)])
+        return spark.createDataFrame([(val,) for val in data], schema)
 
 
 def test_expectation_name():
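The explicit-schema branch above matters mainly for the empty and all-null scenarios, where Spark cannot infer a column type. A minimal standalone sketch of that behaviour (not part of this commit; it assumes only a local pyspark installation, and the SparkSession below stands in for the tests' spark fixture):

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, LongType

    spark = SparkSession.builder.master("local[1]").appName("schema-sketch").getOrCreate()

    # An explicit schema keeps a concrete column type even when nothing can be inferred.
    schema = StructType([StructField("col1", LongType(), True)])
    empty_df = spark.createDataFrame([], schema)                  # 0 rows, col1: long
    nulls_df = spark.createDataFrame([(None,), (None,)], schema)  # all-null column

    empty_df.printSchema()              # col1: long (nullable = true)
    print(nulls_df.distinct().count())  # 1 -- the null counts as one distinct value

    spark.stop()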
@@ -48,23 +67,23 @@ def test_expectation_name():
 
 
 @pytest.mark.parametrize(
-    "df_type, df_data, min_value, max_value, expected_result, expected_message",
+    "df_type, df_data, min_value, max_value, expected_result, expected_message, data_type",
     [
         # Basic success - 3 distinct values within range [2, 5]
-        ("pandas", [1, 2, 3, 2, 1], 2, 5, "success", None),
-        ("pyspark", [1, 2, 3, 2, 1], 2, 5, "success", None),
+        ("pandas", [1, 2, 3, 2, 1], 2, 5, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 2, 1], 2, 5, "success", None, "long"),
         # Success with nulls - 3 distinct values [1, 2, None] within range [3, 4]
-        ("pandas", [1, 2, None, 2, 1], 3, 4, "success", None),
-        ("pyspark", [1, 2, None, 2, 1], 3, 4, "success", None),
+        ("pandas", [1, 2, None, 2, 1], 3, 4, "success", None, "long"),
+        ("pyspark", [1, 2, None, 2, 1], 3, 4, "success", None, "long"),
         # Exact minimum boundary - 3 distinct values at min boundary [3, 5]
-        ("pandas", [1, 2, 3, 2, 1], 3, 5, "success", None),
-        ("pyspark", [1, 2, 3, 2, 1], 3, 5, "success", None),
+        ("pandas", [1, 2, 3, 2, 1], 3, 5, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 2, 1], 3, 5, "success", None, "long"),
         # Exact maximum boundary - 5 distinct values at max boundary [3, 5]
-        ("pandas", [1, 2, 3, 4, 5, 1], 3, 5, "success", None),
-        ("pyspark", [1, 2, 3, 4, 5, 1], 3, 5, "success", None),
+        ("pandas", [1, 2, 3, 4, 5, 1], 3, 5, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 4, 5, 1], 3, 5, "success", None, "long"),
         # Edge case: zero range (min == max) - success with exact match
-        ("pandas", [1, 2, 3, 2, 1], 3, 3, "success", None),
-        ("pyspark", [1, 2, 3, 2, 1], 3, 3, "success", None),
+        ("pandas", [1, 2, 3, 2, 1], 3, 3, "success", None, "long"),
+        ("pyspark", [1, 2, 3, 2, 1], 3, 3, "success", None, "long"),
         # Edge case: zero range (min == max) - failure when not exact
         (
             "pandas",
@@ -73,6 +92,7 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 3 and 3.",
+            "long",
         ),
         (
             "pyspark",
@@ -81,13 +101,14 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 3 and 3.",
+            "long",
         ),
         # Edge case: empty DataFrame - 0 distinct values within range [0, 5]
-        ("pandas", [], 0, 5, "success", None),
-        ("pyspark", [], 0, 5, "success", None),
+        ("pandas", [], 0, 5, "success", None, "long"),
+        ("pyspark", [], 0, 5, "success", None, "long"),
         # Edge case: single distinct value - 1 distinct value within range [1, 1]
-        ("pandas", [1, 1, 1, 1, 1], 1, 1, "success", None),
-        ("pyspark", [1, 1, 1, 1, 1], 1, 1, "success", None),
+        ("pandas", [1, 1, 1, 1, 1], 1, 1, "success", None, "long"),
+        ("pyspark", [1, 1, 1, 1, 1], 1, 1, "success", None, "long"),
         # Too few distinct values - 2 distinct, expecting [4, 6]
         (
             "pandas",
@@ -96,6 +117,7 @@ def test_expectation_name():
             6,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 4 and 6.",
+            "long",
         ),
         (
             "pyspark",
@@ -104,6 +126,7 @@ def test_expectation_name():
             6,
             "failure",
             "Column 'col1' has 2 distinct values, expected between 4 and 6.",
+            "long",
         ),
         # Too many distinct values - 5 distinct, expecting [2, 3]
         (
@@ -113,6 +136,7 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 5 distinct values, expected between 2 and 3.",
+            "long",
         ),
         (
             "pyspark",
@@ -121,7 +145,52 @@ def test_expectation_name():
             3,
             "failure",
             "Column 'col1' has 5 distinct values, expected between 2 and 3.",
+            "long",
+        ),
+        # Data type validation scenarios (consolidated from test_data_type_validation)
+        # String column with mixed values including None
+        ("pandas", ["A", "B", "C", "B", "A", None], 3, 5, "success", None, "string"),
+        ("pyspark", ["A", "B", "C", "B", "A", None], 3, 5, "success", None, "string"),
+        # Float column
+        ("pandas", [1.1, 2.2, 3.3, 2.2, 1.1], 2, 4, "success", None, "double"),
+        ("pyspark", [1.1, 2.2, 3.3, 2.2, 1.1], 2, 4, "success", None, "double"),
+        # Boolean column
+        ("pandas", [True, False, True, False, True], 2, 2, "success", None, "boolean"),
+        ("pyspark", [True, False, True, False, True], 2, 2, "success", None, "boolean"),
+        # Datetime column - pandas with pd.to_datetime
+        (
+            "pandas",
+            pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-02", "2023-01-01"]),
+            2,
+            4,
+            "success",
+            None,
+            "timestamp",
+        ),
+        # Datetime column - pyspark with datetime objects
+        (
+            "pyspark",
+            pd.to_datetime(
+                ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-02", "2023-01-01"]
+            ).to_pydatetime(),
+            2,
+            4,
+            "success",
+            None,
+            "timestamp",
         ),
+        # Negative integers
+        ("pandas", [-10, -20, -30, -20, -10], 2, 4, "success", None, "long"),
+        ("pyspark", [-10, -20, -30, -20, -10], 2, 4, "success", None, "long"),
+        # Mixed positive and negative integers
+        ("pandas", [-1, 0, 1, 0, -1], 2, 4, "success", None, "long"),
+        ("pyspark", [-1, 0, 1, 0, -1], 2, 4, "success", None, "long"),
+        # Large integers
+        ("pandas", [1000000, 2000000, 3000000, 2000000, 1000000], 2, 4, "success", None, "long"),
+        ("pyspark", [1000000, 2000000, 3000000, 2000000, 1000000], 2, 4, "success", None, "long"),
+        # All null values
+        ("pandas", [None, None, None, None], 0, 1, "success", None, "long"),
+        ("pyspark", [None, None, None, None], 0, 1, "success", None, "long"),
     ],
     ids=[
         "pandas_success",
@@ -144,18 +213,35 @@ def test_expectation_name():
         "pyspark_too_few",
         "pandas_too_many",
         "pyspark_too_many",
+        "pandas_string_with_nulls",
+        "pyspark_string_with_nulls",
+        "pandas_float",
+        "pyspark_float",
+        "pandas_boolean",
+        "pyspark_boolean",
+        "pandas_datetime",
+        "pyspark_datetime",
+        "pandas_negative_integers",
+        "pyspark_negative_integers",
+        "pandas_mixed_positive_negative",
+        "pyspark_mixed_positive_negative",
+        "pandas_large_integers",
+        "pyspark_large_integers",
+        "pandas_all_nulls",
+        "pyspark_all_nulls",
     ],
 )
 def test_expectation_basic_scenarios(
-    df_type, df_data, min_value, max_value, expected_result, expected_message, spark
+    df_type, df_data, min_value, max_value, expected_result, expected_message, data_type, spark
 ):
     """
     Test the expectation for various scenarios across pandas and PySpark DataFrames.
     Tests both direct expectation validation and suite-based validation.
     Covers: success cases, success with nulls, exact boundaries, edge cases (zero range, empty, single value),
-    too few values, and too many values.
+    too few values, too many values, and various data types (string, float, boolean, datetime, negative integers,
+    mixed positive/negative integers, large integers, all nulls).
     """
-    data_frame = create_dataframe(df_type, df_data, "col1", spark)
+    data_frame = create_dataframe(df_type, df_data, "col1", spark, data_type)
 
     # Test 1: Direct expectation validation
     expectation = DataFrameExpectationRegistry.get_expectation(
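Since the ids are now normalised to an engine-first pattern (pandas_* / pyspark_*), a single scenario can be selected by id with standard pytest keyword filtering, for example (plain pytest usage, not specific to this commit):

    pytest tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_between.py -k pandas_datetime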
@@ -176,7 +262,7 @@ def test_expectation_basic_scenarios(
     else:  # failure
         expected_failure_message = DataFrameExpectationFailureMessage(
             expectation_str=str(expectation),
-            data_frame_type=get_df_type_enum(df_type),
+            data_frame_type=str(df_type),
             message=expected_message,
         )
         assert str(result) == str(expected_failure_message), (
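For reference, a single failing case now flows through the shared test roughly as follows. This is a sketch assembled from the code in this diff (the data values are illustrative, not the commit's own), and it assumes the test module's existing imports:

    # Direct validation of one failing pandas case and the expected failure message.
    expectation = DataFrameExpectationRegistry.get_expectation(
        expectation_name="ExpectationDistinctColumnValuesBetween",
        column_name="col1",
        min_value=4,
        max_value=6,
    )
    data_frame = pd.DataFrame({"col1": [1, 2, 2, 1]})  # 2 distinct values, below the minimum of 4
    result = expectation.validate(data_frame=data_frame)

    expected_failure_message = DataFrameExpectationFailureMessage(
        expectation_str=str(expectation),
        data_frame_type="pandas",  # str(df_type) for the pandas case, as introduced by this commit
        message="Column 'col1' has 2 distinct values, expected between 4 and 6.",
    )
    assert str(result) == str(expected_failure_message)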
@@ -264,46 +350,6 @@ def test_invalid_parameters(min_value, max_value, expected_error_message):
     )
 
 
-@pytest.mark.parametrize(
-    "df_data, min_value, max_value, expected_distinct, description",
-    [
-        # String column with mixed values including None
-        (["A", "B", "C", "B", "A", None], 3, 5, 4, "string_with_nulls"),
-        # Float column
-        ([1.1, 2.2, 3.3, 2.2, 1.1], 2, 4, 3, "float"),
-        # Boolean column
-        ([True, False, True, False, True], 2, 2, 2, "boolean"),
-        # Datetime column
-        (
-            pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-02", "2023-01-01"]),
-            2,
-            4,
-            3,
-            "datetime",
-        ),
-    ],
-    ids=["string_with_nulls", "float", "boolean", "datetime"],
-)
-def test_data_type_validation(df_data, min_value, max_value, expected_distinct, description):
-    """
-    Test the expectation with various data types.
-    Verifies that distinct value counting works correctly across different column types.
-    """
-    expectation = DataFrameExpectationRegistry.get_expectation(
-        expectation_name="ExpectationDistinctColumnValuesBetween",
-        column_name="col1",
-        min_value=min_value,
-        max_value=max_value,
-    )
-
-    data_frame = pd.DataFrame({"col1": df_data})
-    result = expectation.validate(data_frame=data_frame)
-
-    assert isinstance(result, DataFrameExpectationSuccessMessage), (
-        f"Expected DataFrameExpectationSuccessMessage for {description} data type but got: {type(result)}"
-    )
-
-
 def test_large_dataset_performance():
     """
     Test the expectation with a larger dataset to ensure reasonable performance.

tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_equals.py

Lines changed: 24 additions & 29 deletions
@@ -52,11 +52,6 @@ def create_dataframe(df_type, data, column_name, spark, data_type="long"):
         return spark.createDataFrame([(val,) for val in data], schema)
 
 
-def get_df_type_enum(df_type):
-    """Get DataFrameType enum value."""
-    return DataFrameType.PANDAS if df_type == "pandas" else DataFrameType.PYSPARK
-
-
 def test_expectation_name():
     """
     Test that the expectation name is correctly returned.
@@ -227,29 +222,29 @@ def test_expectation_name():
         "pyspark_empty",
         "pandas_single_value",
         "pyspark_single_value",
-        "string_with_nulls_pandas",
-        "string_with_nulls_pyspark",
-        "string_case_sensitive_pandas",
-        "string_case_sensitive_pyspark",
-        "float_pandas",
-        "float_pyspark",
-        "numeric_precision_pandas",
-        "numeric_precision_pyspark",
-        "boolean_pandas",
-        "boolean_pyspark",
-        "boolean_with_none_pandas",
-        "boolean_with_none_pyspark",
-        "datetime_pandas",
-        "datetime_pyspark",
-        "datetime_with_timezone_pandas",
-        "datetime_with_timezone_pyspark",
-        "mixed_data_types",
-        "categorical",
-        "duplicate_nan_handling_pandas",
-        "duplicate_nan_handling_pyspark",
-        "string_whitespace_pandas",
-        "string_whitespace_pyspark",
-        "numeric_string_vs_numeric",
+        "pandas_string_with_nulls",
+        "pyspark_string_with_nulls",
+        "pandas_string_case_sensitive",
+        "pyspark_string_case_sensitive",
+        "pandas_float",
+        "pyspark_float",
+        "pandas_numeric_precision",
+        "pyspark_numeric_precision",
+        "pandas_boolean",
+        "pyspark_boolean",
+        "pandas_boolean_with_none",
+        "pyspark_boolean_with_none",
+        "pandas_datetime",
+        "pyspark_datetime",
+        "pandas_datetime_with_timezone",
+        "pyspark_datetime_with_timezone",
+        "pandas_mixed_data_types",
+        "pandas_categorical",
+        "pandas_duplicate_nan_handling",
+        "pyspark_duplicate_nan_handling",
+        "pandas_string_whitespace",
+        "pyspark_string_whitespace",
+        "pandas_numeric_string_vs_numeric",
     ],
 )
 def test_expectation_basic_scenarios(
@@ -285,7 +280,7 @@ def test_expectation_basic_scenarios(
     else:  # failure
         expected_failure_message = DataFrameExpectationFailureMessage(
             expectation_str=str(expectation),
-            data_frame_type=get_df_type_enum(df_type),
+            data_frame_type=str(df_type),
             message=expected_message,
         )
         assert str(result) == str(expected_failure_message), (
