
Commit 111bca1

Merge pull request #21 from getyourguide/refactor/restructure-codebase
feat: restructure codebase, and registry refactoring
2 parents 5567760 + 276589d commit 111bca1

70 files changed (+495, -413 lines)


README.md

Lines changed: 3 additions & 3 deletions
@@ -55,7 +55,7 @@ uv run pytest tests/ --cov=dataframe_expectations
 
 **Basic usage with Pandas:**
 ```python
-from dataframe_expectations.expectations_suite import DataFrameExpectationsSuite
+from dataframe_expectations.suite import DataFrameExpectationsSuite
 import pandas as pd
 
 # Build a suite with expectations
@@ -82,7 +82,7 @@ runner.run(df)
 
 **PySpark example:**
 ```python
-from dataframe_expectations.expectations_suite import DataFrameExpectationsSuite
+from dataframe_expectations.suite import DataFrameExpectationsSuite
 from pyspark.sql import SparkSession
 
 # Initialize Spark session
@@ -116,7 +116,7 @@ runner.run(df)
 
 **Decorator pattern for automatic validation:**
 ```python
-from dataframe_expectations.expectations_suite import DataFrameExpectationsSuite
+from dataframe_expectations.suite import DataFrameExpectationsSuite
 from pyspark.sql import SparkSession
 
 # Initialize Spark session
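All three snippets change only their first line: the suite moved from `dataframe_expectations.expectations_suite` to `dataframe_expectations.suite`. The upgrade in isolation (both paths verbatim from the hunks above):

```python
# Before this commit:
# from dataframe_expectations.expectations_suite import DataFrameExpectationsSuite

# After this commit:
from dataframe_expectations.suite import DataFrameExpectationsSuite
```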

dataframe_expectations/__init__.py

Lines changed: 9 additions & 13 deletions
@@ -1,16 +1,12 @@
-from enum import Enum
-from typing import Union
+"""DataFrame Expectations - A validation library for pandas and PySpark DataFrames."""
 
-from pandas import DataFrame as PandasDataFrame
-from pyspark.sql import DataFrame as PySparkDataFrame
+try:
+    from importlib.metadata import version
 
-DataFrameLike = Union[PySparkDataFrame, PandasDataFrame]
+    __version__ = version("dataframe-expectations")
+except Exception:
+    # Package is not installed (e.g., during development or linting)
+    # Catch all exceptions to handle various edge cases in different environments
+    __version__ = "0.0.0.dev0"
 
-
-class DataFrameType(str, Enum):
-    """
-    Enum for DataFrame types.
-    """
-
-    PANDAS = "pandas"
-    PYSPARK = "pyspark"
+__all__ = []
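The net effect: `__version__` is now resolved from installed package metadata instead of living in code, and `DataFrameLike`/`DataFrameType` move out of the package root (they reappear in `core/types.py` below). A quick sketch of the version lookup in use; the fallback string comes straight from the diff:

```python
import dataframe_expectations

# Resolves via importlib.metadata when the distribution is installed;
# falls back to the placeholder whenever that lookup fails.
print(dataframe_expectations.__version__)  # "0.0.0.dev0" in an uninstalled source checkout
```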
dataframe_expectations/core/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+"""Core base classes and interfaces for DataFrame expectations."""
+
+__all__ = []

dataframe_expectations/expectations/aggregation_expectation.py renamed to dataframe_expectations/core/aggregation_expectation.py

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 from abc import abstractmethod
 from typing import List, Union
 
-from dataframe_expectations import DataFrameLike, DataFrameType
-from dataframe_expectations.expectations import DataFrameExpectation
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.core.expectation import DataFrameExpectation
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
     DataFrameExpectationResultMessage,

dataframe_expectations/expectations/column_expectation.py renamed to dataframe_expectations/core/column_expectation.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 from typing import Callable
 
-from dataframe_expectations import DataFrameLike, DataFrameType
-from dataframe_expectations.expectations import DataFrameExpectation
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.core.expectation import DataFrameExpectation
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
     DataFrameExpectationResultMessage,
dataframe_expectations/core/expectation.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+from abc import ABC, abstractmethod
+from typing import cast
+
+from pandas import DataFrame as PandasDataFrame
+from pyspark.sql import DataFrame as PySparkDataFrame
+
+# Import the connect DataFrame type for Spark Connect
+try:
+    from pyspark.sql.connect.dataframe import DataFrame as PySparkConnectDataFrame
+except ImportError:
+    # Fallback for older PySpark versions that don't have connect
+    PySparkConnectDataFrame = None  # type: ignore[misc,assignment]
+
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.result_message import (
+    DataFrameExpectationResultMessage,
+)
+
+
+class DataFrameExpectation(ABC):
+    """
+    Base class for DataFrame expectations.
+    """
+
+    def get_expectation_name(self) -> str:
+        """
+        Returns the class name as the expectation name.
+        """
+        return type(self).__name__
+
+    @abstractmethod
+    def get_description(self) -> str:
+        """
+        Returns a description of the expectation.
+        """
+        raise NotImplementedError(
+            f"description method must be implemented for {self.__class__.__name__}"
+        )
+
+    def __str__(self):
+        """
+        Returns a string representation of the expectation.
+        """
+        return f"{self.get_expectation_name()} ({self.get_description()})"
+
+    @classmethod
+    def infer_data_frame_type(cls, data_frame: DataFrameLike) -> DataFrameType:
+        """
+        Infer the DataFrame type based on the provided DataFrame.
+        """
+        if isinstance(data_frame, PandasDataFrame):
+            return DataFrameType.PANDAS
+        elif isinstance(data_frame, PySparkDataFrame):
+            return DataFrameType.PYSPARK
+        elif PySparkConnectDataFrame is not None and isinstance(
+            data_frame, PySparkConnectDataFrame
+        ):
+            return DataFrameType.PYSPARK
+        else:
+            raise ValueError(f"Unsupported DataFrame type: {type(data_frame)}")
+
+    def validate(self, data_frame: DataFrameLike, **kwargs):
+        """
+        Validate the DataFrame against the expectation.
+        """
+        data_frame_type = self.infer_data_frame_type(data_frame)
+
+        if data_frame_type == DataFrameType.PANDAS:
+            return self.validate_pandas(data_frame=data_frame, **kwargs)
+        elif data_frame_type == DataFrameType.PYSPARK:
+            return self.validate_pyspark(data_frame=data_frame, **kwargs)
+        else:
+            raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")
+
+    @abstractmethod
+    def validate_pandas(
+        self, data_frame: DataFrameLike, **kwargs
+    ) -> DataFrameExpectationResultMessage:
+        """
+        Validate a pandas DataFrame against the expectation.
+        """
+        raise NotImplementedError(
+            f"validate_pandas method must be implemented for {self.__class__.__name__}"
+        )
+
+    @abstractmethod
+    def validate_pyspark(
+        self, data_frame: DataFrameLike, **kwargs
+    ) -> DataFrameExpectationResultMessage:
+        """
+        Validate a PySpark DataFrame against the expectation.
+        """
+        raise NotImplementedError(
+            f"validate_pyspark method must be implemented for {self.__class__.__name__}"
+        )
+
+    @classmethod
+    def num_data_frame_rows(cls, data_frame: DataFrameLike) -> int:
+        """
+        Count the number of rows in the DataFrame.
+        """
+        data_frame_type = cls.infer_data_frame_type(data_frame)
+        if data_frame_type == DataFrameType.PANDAS:
+            # Cast to PandasDataFrame since we know it's a Pandas DataFrame at this point
+            return len(cast(PandasDataFrame, data_frame))
+        elif data_frame_type == DataFrameType.PYSPARK:
+            # Cast to PySparkDataFrame since we know it's a PySpark DataFrame at this point
+            return cast(PySparkDataFrame, data_frame).count()
+        else:
+            raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")
dataframe_expectations/core/types.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+"""Core types, enums, and data models for dataframe-expectations."""
+
+from enum import Enum
+from typing import Any, Dict, Union
+
+from pandas import DataFrame as PandasDataFrame
+from pydantic import BaseModel, ConfigDict, Field
+from pyspark.sql import DataFrame as PySparkDataFrame
+
+# Type aliases
+DataFrameLike = Union[PySparkDataFrame, PandasDataFrame]
+
+
+class DataFrameType(str, Enum):
+    """Enum for DataFrame types."""
+
+    PANDAS = "pandas"
+    PYSPARK = "pyspark"
+
+
+class ExpectationCategory(str, Enum):
+    """Categories for expectations."""
+
+    COLUMN_EXPECTATIONS = "Column Expectations"
+    COLUMN_AGGREGATION_EXPECTATIONS = "Column Aggregation Expectations"
+    DATAFRAME_AGGREGATION_EXPECTATIONS = "DataFrame Aggregation Expectations"
+
+
+class ExpectationSubcategory(str, Enum):
+    """Subcategory of expectations."""
+
+    ANY_VALUE = "Any Value"
+    NUMERICAL = "Numerical"
+    STRING = "String"
+    UNIQUE = "Unique"
+
+
+class ExpectationMetadata(BaseModel):
+    """Metadata for a registered expectation."""
+
+    suite_method_name: str = Field(
+        ..., description="Method name in ExpectationsSuite (e.g., 'expect_value_greater_than')"
+    )
+    pydoc: str = Field(..., description="Human-readable description of the expectation")
+    category: ExpectationCategory = Field(..., description="Category (e.g., 'Column Expectations')")
+    subcategory: ExpectationSubcategory = Field(
+        ..., description="Subcategory (e.g., 'Numerical', 'String')"
+    )
+    params_doc: Dict[str, str] = Field(..., description="Documentation for each parameter")
+    params: list = Field(default_factory=list, description="List of required parameter names")
+    param_types: Dict[str, Any] = Field(
+        default_factory=dict, description="Type hints for parameters"
+    )
+    factory_func_name: str = Field(..., description="Name of the factory function")
+    expectation_name: str = Field(..., description="Name of the expectation class")
+
+    model_config = ConfigDict(frozen=True)  # Make model immutable
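For illustration, an `ExpectationMetadata` record built by hand; the method name echoes the example given in the field's own description, while the other values are invented for this sketch:

```python
from dataframe_expectations.core.types import (
    ExpectationCategory,
    ExpectationMetadata,
    ExpectationSubcategory,
)

meta = ExpectationMetadata(
    suite_method_name="expect_value_greater_than",
    pydoc="Expect every value in a column to exceed a threshold.",  # invented
    category=ExpectationCategory.COLUMN_EXPECTATIONS,
    subcategory=ExpectationSubcategory.NUMERICAL,
    params_doc={"column": "Column to check", "value": "Exclusive lower bound"},  # invented
    params=["column", "value"],
    param_types={"column": str, "value": int},
    factory_func_name="expectation_value_greater_than",  # hypothetical
    expectation_name="ExpectationValueGreaterThan",  # hypothetical
)

# ConfigDict(frozen=True) makes the record immutable: reassigning any field,
# e.g. meta.pydoc = "...", raises a pydantic ValidationError.
```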

dataframe_expectations/expectations/utils.py renamed to dataframe_expectations/core/utils.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from functools import wraps
 from typing import Any, Callable, Dict, Optional, Tuple, Type, Union, get_args
 
-from dataframe_expectations.expectations import DataFrameExpectation
+from dataframe_expectations.core.expectation import DataFrameExpectation
 
 
 def requires_params(
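The hunk cuts off at the `requires_params(` signature, so the real implementation isn't visible here. As a loose, hypothetical sketch of what a parameter-requiring decorator of this shape typically does (names, signature, and behavior below are assumptions, not the library's actual code):

```python
from functools import wraps
from typing import Any, Callable


def requires_params_sketch(*required: str) -> Callable:
    """Hypothetical stand-in for requires_params; the real one may differ."""

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(**kwargs: Any) -> Any:
            # Fail fast when a declared parameter is missing from the call.
            missing = [name for name in required if name not in kwargs]
            if missing:
                raise ValueError(f"Missing required params: {missing}")
            return func(**kwargs)

        return wrapper

    return decorator
```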
dataframe_expectations/expectations/__init__.py

Lines changed: 1 addition & 110 deletions
@@ -1,110 +1 @@
-from abc import ABC, abstractmethod
-from typing import cast
-
-from pandas import DataFrame as PandasDataFrame
-from pyspark.sql import DataFrame as PySparkDataFrame
-
-# Import the connect DataFrame type for Spark Connect
-try:
-    from pyspark.sql.connect.dataframe import DataFrame as PySparkConnectDataFrame
-except ImportError:
-    # Fallback for older PySpark versions that don't have connect
-    PySparkConnectDataFrame = None  # type: ignore[misc,assignment]
-
-from dataframe_expectations import DataFrameLike, DataFrameType
-from dataframe_expectations.result_message import (
-    DataFrameExpectationResultMessage,
-)
-
-
-class DataFrameExpectation(ABC):
-    """
-    Base class for DataFrame expectations.
-    """
-
-    def get_expectation_name(self) -> str:
-        """
-        Returns the class name as the expectation name.
-        """
-        return type(self).__name__
-
-    @abstractmethod
-    def get_description(self) -> str:
-        """
-        Returns a description of the expectation.
-        """
-        raise NotImplementedError(
-            f"description method must be implemented for {self.__class__.__name__}"
-        )
-
-    def __str__(self):
-        """
-        Returns a string representation of the expectation.
-        """
-        return f"{self.get_expectation_name()} ({self.get_description()})"
-
-    @classmethod
-    def infer_data_frame_type(cls, data_frame: DataFrameLike) -> DataFrameType:
-        """
-        Infer the DataFrame type based on the provided DataFrame.
-        """
-        if isinstance(data_frame, PandasDataFrame):
-            return DataFrameType.PANDAS
-        elif isinstance(data_frame, PySparkDataFrame):
-            return DataFrameType.PYSPARK
-        elif PySparkConnectDataFrame is not None and isinstance(
-            data_frame, PySparkConnectDataFrame
-        ):
-            return DataFrameType.PYSPARK
-        else:
-            raise ValueError(f"Unsupported DataFrame type: {type(data_frame)}")
-
-    def validate(self, data_frame: DataFrameLike, **kwargs):
-        """
-        Validate the DataFrame against the expectation.
-        """
-        data_frame_type = self.infer_data_frame_type(data_frame)
-
-        if data_frame_type == DataFrameType.PANDAS:
-            return self.validate_pandas(data_frame=data_frame, **kwargs)
-        elif data_frame_type == DataFrameType.PYSPARK:
-            return self.validate_pyspark(data_frame=data_frame, **kwargs)
-        else:
-            raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")
-
-    @abstractmethod
-    def validate_pandas(
-        self, data_frame: DataFrameLike, **kwargs
-    ) -> DataFrameExpectationResultMessage:
-        """
-        Validate a pandas DataFrame against the expectation.
-        """
-        raise NotImplementedError(
-            f"validate_pandas method must be implemented for {self.__class__.__name__}"
-        )
-
-    @abstractmethod
-    def validate_pyspark(
-        self, data_frame: DataFrameLike, **kwargs
-    ) -> DataFrameExpectationResultMessage:
-        """
-        Validate a PySpark DataFrame against the expectation.
-        """
-        raise NotImplementedError(
-            f"validate_pyspark method must be implemented for {self.__class__.__name__}"
-        )
-
-    @classmethod
-    def num_data_frame_rows(cls, data_frame: DataFrameLike) -> int:
-        """
-        Count the number of rows in the DataFrame.
-        """
-        data_frame_type = cls.infer_data_frame_type(data_frame)
-        if data_frame_type == DataFrameType.PANDAS:
-            # Cast to PandasDataFrame since we know it's a Pandas DataFrame at this point
-            return len(cast(PandasDataFrame, data_frame))
-        elif data_frame_type == DataFrameType.PYSPARK:
-            # Cast to PySparkDataFrame since we know it's a PySpark DataFrame at this point
-            return cast(PySparkDataFrame, data_frame).count()
-        else:
-            raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")
+"""Expectations package - contains all expectation implementations."""

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+"""Aggregation expectations."""
+
+__all__ = []
