Skip to content

Commit e0b8240

Browse files
authored
Merge pull request #12 from getyourguide/refactor-expectation-registration
This PR refactors the expectation registration system to address the growing size of the DataFrameExpectationsSuite class (previously 800+ lines of boilerplate). The new architecture replaces manual method definitions with dynamic generation from a central registry, while maintaining full IDE type-ahead support through auto-generated stub files.
2 parents 1b6d79a + d9578c2 commit e0b8240

File tree

21 files changed

+2249
-1183
lines changed

21 files changed

+2249
-1183
lines changed

.github/workflows/main.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ jobs:
2828
uv sync --group dev
2929
- name: Run sanity check
3030
run: |
31-
uv run python sanity_checks.py
32-
working-directory: dataframe_expectations
31+
uv run python scripts/sanity_checks.py
3332
- name: Run tests
3433
run: |
3534
uv run pytest tests/ --cov=dataframe_expectations

.pre-commit-config.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@ repos:
6060
]
6161
stages: [pre-commit, manual]
6262

63+
64+
# ============================================================================
65+
# Stub methods check - custom local hook
66+
# ============================================================================
67+
- repo: local
68+
hooks:
69+
- id: check-stub-methods
70+
name: Check expectation stubs are up-to-date
71+
entry: python scripts/generate_suite_stubs.py --check
72+
language: system
73+
pass_filenames: false
74+
6375
# ============================================================================
6476
# Commit message validation - Conventional Commits
6577
# ============================================================================

dataframe_expectations/expectations/aggregation_expectations/__init__.py

Whitespace-only changes.

dataframe_expectations/expectations/aggregation_expectations/any_value_expectations.py

Lines changed: 70 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
DataFrameAggregationExpectation,
1010
)
1111
from dataframe_expectations.expectations.expectation_registry import (
12+
ExpectationCategory,
13+
ExpectationSubcategory,
1214
register_expectation,
1315
)
1416
from dataframe_expectations.expectations.utils import requires_params
@@ -36,8 +38,7 @@ def __init__(self, min_rows: int):
3638
"""
3739
Initialize the minimum rows expectation.
3840
39-
Args:
40-
min_rows (int): Minimum number of rows required (inclusive).
41+
:param min_rows: Minimum number of rows required (inclusive).
4142
"""
4243
if min_rows < 0:
4344
raise ValueError(f"min_rows must be non-negative, got {min_rows}")
@@ -122,8 +123,7 @@ def __init__(self, max_rows: int):
122123
"""
123124
Initialize the maximum rows expectation.
124125
125-
Args:
126-
max_rows (int): Maximum number of rows allowed (inclusive).
126+
:param max_rows: Maximum number of rows allowed (inclusive).
127127
"""
128128
if max_rows < 0:
129129
raise ValueError(f"max_rows must be non-negative, got {max_rows}")
@@ -212,9 +212,8 @@ def __init__(self, column_name: str, max_percentage: float):
212212
"""
213213
Initialize the maximum null percentage expectation.
214214
215-
Args:
216-
column_name (str): Name of the column to check for null percentage.
217-
max_percentage (float): Maximum percentage of null values allowed (0.0-100.0).
215+
:param column_name: Name of the column to check for null percentage.
216+
:param max_percentage: Maximum percentage of null values allowed (0.0-100.0).
218217
"""
219218
if not 0 <= max_percentage <= 100:
220219
raise ValueError(f"max_percentage must be between 0.0 and 100.0, got {max_percentage}")
@@ -330,9 +329,8 @@ def __init__(self, column_name: str, max_count: int):
330329
"""
331330
Initialize the maximum null count expectation.
332331
333-
Args:
334-
column_name (str): Name of the column to check for null count.
335-
max_count (int): Maximum number of null values allowed.
332+
:param column_name: Name of the column to check for null count.
333+
:param max_count: Maximum number of null values allowed.
336334
"""
337335
if max_count < 0:
338336
raise ValueError(f"max_count must be non-negative, got {max_count}")
@@ -414,77 +412,101 @@ def aggregate_and_validate_pyspark(
414412

415413

416414
# Factory functions for the registry
417-
@register_expectation("ExpectationMinRows")
415+
@register_expectation(
416+
"ExpectationMinRows",
417+
pydoc="Check if the DataFrame has at least a minimum number of rows",
418+
category=ExpectationCategory.DATAFRAME_AGGREGATION_EXPECTATIONS,
419+
subcategory=ExpectationSubcategory.ANY_VALUE,
420+
params_doc={
421+
"min_rows": "The minimum number of rows expected",
422+
},
423+
)
418424
@requires_params("min_rows", types={"min_rows": int})
419-
def create_expectation_min_rows(**kwargs) -> ExpectationMinRows:
425+
def create_expectation_min_rows(min_rows: int) -> ExpectationMinRows:
420426
"""
421427
Create an ExpectationMinRows instance.
422428
423-
Args:
424-
min_rows (int): Minimum number of rows required.
425-
426-
Returns:
427-
ExpectationMinRows: A configured expectation instance.
429+
:param min_rows: Minimum number of rows required.
430+
:return: A configured expectation instance.
428431
"""
429-
return ExpectationMinRows(min_rows=kwargs["min_rows"])
432+
return ExpectationMinRows(min_rows=min_rows)
430433

431434

432-
@register_expectation("ExpectationMaxRows")
435+
@register_expectation(
436+
"ExpectationMaxRows",
437+
pydoc="Check if the DataFrame has at most a maximum number of rows",
438+
category=ExpectationCategory.DATAFRAME_AGGREGATION_EXPECTATIONS,
439+
subcategory=ExpectationSubcategory.ANY_VALUE,
440+
params_doc={
441+
"max_rows": "The maximum number of rows expected",
442+
},
443+
)
433444
@requires_params("max_rows", types={"max_rows": int})
434-
def create_expectation_max_rows(**kwargs) -> ExpectationMaxRows:
445+
def create_expectation_max_rows(max_rows: int) -> ExpectationMaxRows:
435446
"""
436447
Create an ExpectationMaxRows instance.
437448
438-
Args:
439-
max_rows (int): Maximum number of rows allowed.
440-
441-
Returns:
442-
ExpectationMaxRows: A configured expectation instance.
449+
:param max_rows: Maximum number of rows allowed.
450+
:return: A configured expectation instance.
443451
"""
444-
return ExpectationMaxRows(max_rows=kwargs["max_rows"])
445-
446-
447-
@register_expectation("ExpectationMaxNullPercentage")
452+
return ExpectationMaxRows(max_rows=max_rows)
453+
454+
455+
@register_expectation(
456+
"ExpectationMaxNullPercentage",
457+
pydoc="Check if the percentage of null/NaN values in a specific column is below a threshold",
458+
category=ExpectationCategory.COLUMN_AGGREGATION_EXPECTATIONS,
459+
subcategory=ExpectationSubcategory.ANY_VALUE,
460+
params_doc={
461+
"column_name": "The name of the column to check for null percentage",
462+
"max_percentage": "The maximum allowed percentage of null/NaN values (0.0 to 100.0)",
463+
},
464+
)
448465
@requires_params(
449466
"column_name",
450467
"max_percentage",
451468
types={"column_name": str, "max_percentage": (int, float)},
452469
)
453-
def create_expectation_max_null_percentage(**kwargs) -> ExpectationMaxNullPercentage:
470+
def create_expectation_max_null_percentage(
471+
column_name: str, max_percentage: float
472+
) -> ExpectationMaxNullPercentage:
454473
"""
455474
Create an ExpectationMaxNullPercentage instance.
456475
457-
Args:
458-
column_name (str): Name of the column to check for null percentage.
459-
max_percentage (float): Maximum percentage of null values allowed (0.0-100.0).
460-
461-
Returns:
462-
ExpectationMaxNullPercentage: A configured expectation instance.
476+
:param column_name: Name of the column to check for null percentage.
477+
:param max_percentage: Maximum percentage of null values allowed (0.0-100.0).
478+
:return: A configured expectation instance.
463479
"""
464480
return ExpectationMaxNullPercentage(
465-
column_name=kwargs["column_name"],
466-
max_percentage=kwargs["max_percentage"],
481+
column_name=column_name,
482+
max_percentage=max_percentage,
467483
)
468484

469485

470-
@register_expectation("ExpectationMaxNullCount")
486+
@register_expectation(
487+
"ExpectationMaxNullCount",
488+
pydoc="Check if the count of null/NaN values in a specific column is below a threshold",
489+
category=ExpectationCategory.COLUMN_AGGREGATION_EXPECTATIONS,
490+
subcategory=ExpectationSubcategory.ANY_VALUE,
491+
params_doc={
492+
"column_name": "The name of the column to check for null count",
493+
"max_count": "The maximum allowed count of null/NaN values",
494+
},
495+
)
471496
@requires_params(
472497
"column_name",
473498
"max_count",
474499
types={"column_name": str, "max_count": int},
475500
)
476-
def create_expectation_max_null_count(**kwargs) -> ExpectationMaxNullCount:
501+
def create_expectation_max_null_count(column_name: str, max_count: int) -> ExpectationMaxNullCount:
477502
"""
478503
Create an ExpectationMaxNullCount instance.
479504
480-
Args:
481-
column_name (str): Name of the column to check for null count.
482-
max_count (int): Maximum number of null values allowed.
483-
484-
Returns:
485-
ExpectationMaxNullCount: A configured expectation instance.
505+
:param column_name: Name of the column to check for null count.
506+
:param max_count: Maximum number of null values allowed.
507+
:return: A configured expectation instance.
486508
"""
487509
return ExpectationMaxNullCount(
488-
column_name=kwargs["column_name"],
489-
max_count=kwargs["max_count"],
510+
column_name=column_name,
511+
max_count=max_count,
490512
)

0 commit comments

Comments
 (0)