
Commit 42a233a

feat: restructure codebase with core/ module and explicit imports
1 parent: 5567760

64 files changed, +337 additions and -172 deletions

Note: large commits hide some diff content by default, including some file headers below.


dataframe_expectations/__init__.py

Lines changed: 3 additions & 14 deletions
@@ -1,16 +1,5 @@
-from enum import Enum
-from typing import Union
+"""DataFrame Expectations - A validation library for pandas and PySpark DataFrames."""

-from pandas import DataFrame as PandasDataFrame
-from pyspark.sql import DataFrame as PySparkDataFrame
+__version__ = "0.3.0"

-DataFrameLike = Union[PySparkDataFrame, PandasDataFrame]
-
-
-class DataFrameType(str, Enum):
-    """
-    Enum for DataFrame types.
-    """
-
-    PANDAS = "pandas"
-    PYSPARK = "pyspark"
+__all__ = []
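
For downstream code, the net effect is that the shared type aliases move out of the package root into the new core module. A minimal sketch of the updated import, assuming no compatibility re-exports are added back to the package-level __init__.py (its new __all__ is empty):

# Before this commit:
# from dataframe_expectations import DataFrameLike, DataFrameType

# After this commit, the aliases live in the new core module:
from dataframe_expectations.core.types import DataFrameType


def describe(kind: DataFrameType) -> str:
    # DataFrameType subclasses str, so members also compare equal to plain strings.
    return f"validating a {kind.value} DataFrame"


print(describe(DataFrameType.PANDAS))  # validating a pandas DataFrame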
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+"""Core base classes and interfaces for DataFrame expectations."""
+
+__all__ = []

dataframe_expectations/expectations/aggregation_expectation.py renamed to dataframe_expectations/core/aggregation_expectation.py

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 from abc import abstractmethod
 from typing import List, Union

-from dataframe_expectations import DataFrameLike, DataFrameType
-from dataframe_expectations.expectations import DataFrameExpectation
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.core.expectation import DataFrameExpectation
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
     DataFrameExpectationResultMessage,

dataframe_expectations/expectations/column_expectation.py renamed to dataframe_expectations/core/column_expectation.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 from typing import Callable

-from dataframe_expectations import DataFrameLike, DataFrameType
-from dataframe_expectations.expectations import DataFrameExpectation
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.core.expectation import DataFrameExpectation
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
     DataFrameExpectationResultMessage,
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+from abc import ABC, abstractmethod
+from typing import cast
+
+from pandas import DataFrame as PandasDataFrame
+from pyspark.sql import DataFrame as PySparkDataFrame
+
+# Import the connect DataFrame type for Spark Connect
+try:
+    from pyspark.sql.connect.dataframe import DataFrame as PySparkConnectDataFrame
+except ImportError:
+    # Fallback for older PySpark versions that don't have connect
+    PySparkConnectDataFrame = None  # type: ignore[misc,assignment]
+
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.result_message import (
+    DataFrameExpectationResultMessage,
+)
+
+
+class DataFrameExpectation(ABC):
+    """
+    Base class for DataFrame expectations.
+    """
+
+    def get_expectation_name(self) -> str:
+        """
+        Returns the class name as the expectation name.
+        """
+        return type(self).__name__
+
+    @abstractmethod
+    def get_description(self) -> str:
+        """
+        Returns a description of the expectation.
+        """
+        raise NotImplementedError(
+            f"description method must be implemented for {self.__class__.__name__}"
+        )
+
+    def __str__(self):
+        """
+        Returns a string representation of the expectation.
+        """
+        return f"{self.get_expectation_name()} ({self.get_description()})"
+
+    @classmethod
+    def infer_data_frame_type(cls, data_frame: DataFrameLike) -> DataFrameType:
+        """
+        Infer the DataFrame type based on the provided DataFrame.
+        """
+        if isinstance(data_frame, PandasDataFrame):
+            return DataFrameType.PANDAS
+        elif isinstance(data_frame, PySparkDataFrame):
+            return DataFrameType.PYSPARK
+        elif PySparkConnectDataFrame is not None and isinstance(
+            data_frame, PySparkConnectDataFrame
+        ):
+            return DataFrameType.PYSPARK
+        else:
+            raise ValueError(f"Unsupported DataFrame type: {type(data_frame)}")
+
+    def validate(self, data_frame: DataFrameLike, **kwargs):
+        """
+        Validate the DataFrame against the expectation.
+        """
+        data_frame_type = self.infer_data_frame_type(data_frame)
+
+        if data_frame_type == DataFrameType.PANDAS:
+            return self.validate_pandas(data_frame=data_frame, **kwargs)
+        elif data_frame_type == DataFrameType.PYSPARK:
+            return self.validate_pyspark(data_frame=data_frame, **kwargs)
+        else:
+            raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")
+
+    @abstractmethod
+    def validate_pandas(
+        self, data_frame: DataFrameLike, **kwargs
+    ) -> DataFrameExpectationResultMessage:
+        """
+        Validate a pandas DataFrame against the expectation.
+        """
+        raise NotImplementedError(
+            f"validate_pandas method must be implemented for {self.__class__.__name__}"
+        )
+
+    @abstractmethod
+    def validate_pyspark(
+        self, data_frame: DataFrameLike, **kwargs
+    ) -> DataFrameExpectationResultMessage:
+        """
+        Validate a PySpark DataFrame against the expectation.
+        """
+        raise NotImplementedError(
+            f"validate_pyspark method must be implemented for {self.__class__.__name__}"
+        )
+
+    @classmethod
+    def num_data_frame_rows(cls, data_frame: DataFrameLike) -> int:
+        """
+        Count the number of rows in the DataFrame.
+        """
+        data_frame_type = cls.infer_data_frame_type(data_frame)
+        if data_frame_type == DataFrameType.PANDAS:
+            # Cast to PandasDataFrame since we know it's a Pandas DataFrame at this point
+            return len(cast(PandasDataFrame, data_frame))
+        elif data_frame_type == DataFrameType.PYSPARK:
+            # Cast to PySparkDataFrame since we know it's a PySpark DataFrame at this point
+            return cast(PySparkDataFrame, data_frame).count()
+        else:
+            raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")
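
The two concrete classmethods on this base class can be exercised directly. A minimal sketch; the sample DataFrame is invented for illustration, and the import path dataframe_expectations.core.expectation is the one used by the other files in this commit:

import pandas as pd

from dataframe_expectations.core.expectation import DataFrameExpectation
from dataframe_expectations.core.types import DataFrameType

df = pd.DataFrame({"id": [1, 2, 3]})

# infer_data_frame_type() dispatches on the concrete DataFrame class and
# treats Spark Connect frames as PYSPARK when that import is available.
assert DataFrameExpectation.infer_data_frame_type(df) is DataFrameType.PANDAS

# num_data_frame_rows() uses len() for pandas and .count() for PySpark.
assert DataFrameExpectation.num_data_frame_rows(df) == 3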
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+"""Core types, enums, and data models for dataframe-expectations."""
+
+from enum import Enum
+from typing import Any, Dict, Union
+
+from pandas import DataFrame as PandasDataFrame
+from pydantic import BaseModel, ConfigDict, Field
+from pyspark.sql import DataFrame as PySparkDataFrame
+
+# Type aliases
+DataFrameLike = Union[PySparkDataFrame, PandasDataFrame]
+
+
+class DataFrameType(str, Enum):
+    """Enum for DataFrame types."""
+
+    PANDAS = "pandas"
+    PYSPARK = "pyspark"
+
+
+class ExpectationCategory(str, Enum):
+    """Categories for expectations."""
+
+    COLUMN_EXPECTATIONS = "Column Expectations"
+    COLUMN_AGGREGATION_EXPECTATIONS = "Column Aggregation Expectations"
+    DATAFRAME_AGGREGATION_EXPECTATIONS = "DataFrame Aggregation Expectations"
+
+
+class ExpectationSubcategory(str, Enum):
+    """Subcategory of expectations."""
+
+    ANY_VALUE = "Any Value"
+    NUMERICAL = "Numerical"
+    STRING = "String"
+    UNIQUE = "Unique"
+
+
+class ExpectationMetadata(BaseModel):
+    """Metadata for a registered expectation."""
+
+    suite_method_name: str = Field(
+        ..., description="Method name in ExpectationsSuite (e.g., 'expect_value_greater_than')"
+    )
+    pydoc: str = Field(..., description="Human-readable description of the expectation")
+    category: ExpectationCategory = Field(..., description="Category (e.g., 'Column Expectations')")
+    subcategory: ExpectationSubcategory = Field(
+        ..., description="Subcategory (e.g., 'Numerical', 'String')"
+    )
+    params_doc: Dict[str, str] = Field(..., description="Documentation for each parameter")
+    params: list = Field(default_factory=list, description="List of required parameter names")
+    param_types: Dict[str, Any] = Field(
+        default_factory=dict, description="Type hints for parameters"
+    )
+    factory_func_name: str = Field(..., description="Name of the factory function")
+    expectation_name: str = Field(..., description="Name of the expectation class")
+
+    model_config = ConfigDict(frozen=True)  # Make model immutable
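
Because the model is declared with ConfigDict(frozen=True), a metadata record is fixed once constructed. A hedged sketch of building one; the field values are invented for illustration (only 'expect_value_greater_than' appears in the field docs above), and the factory and class names are hypothetical:

from dataframe_expectations.core.types import (
    ExpectationCategory,
    ExpectationMetadata,
    ExpectationSubcategory,
)

meta = ExpectationMetadata(
    suite_method_name="expect_value_greater_than",
    pydoc="Expect every value in a column to be greater than a threshold.",
    category=ExpectationCategory.COLUMN_EXPECTATIONS,
    subcategory=ExpectationSubcategory.NUMERICAL,
    params_doc={"column": "Column to check", "value": "Exclusive lower bound"},
    params=["column", "value"],
    param_types={"column": str, "value": int},
    factory_func_name="expectation_value_greater_than",  # hypothetical name
    expectation_name="ExpectationValueGreaterThan",  # hypothetical name
)

# frozen=True makes the instance immutable; attribute assignment raises
# a pydantic ValidationError:
# meta.pydoc = "changed"  # would fail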

dataframe_expectations/expectations/utils.py renamed to dataframe_expectations/core/utils.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from functools import wraps
 from typing import Any, Callable, Dict, Optional, Tuple, Type, Union, get_args

-from dataframe_expectations.expectations import DataFrameExpectation
+from dataframe_expectations.core.expectation import DataFrameExpectation


 def requires_params(

dataframe_expectations/expectations/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
     # Fallback for older PySpark versions that don't have connect
     PySparkConnectDataFrame = None  # type: ignore[misc,assignment]

-from dataframe_expectations import DataFrameLike, DataFrameType
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
 from dataframe_expectations.result_message import (
     DataFrameExpectationResultMessage,
 )
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+"""Aggregation expectations."""
+
+__all__ = []

dataframe_expectations/expectations/aggregation_expectations/any_value_expectations.py renamed to dataframe_expectations/expectations/aggregation/any_value.py

Lines changed: 4 additions & 4 deletions
@@ -4,16 +4,16 @@
 from pyspark.sql import DataFrame as PySparkDataFrame
 from pyspark.sql import functions as F

-from dataframe_expectations import DataFrameLike, DataFrameType
-from dataframe_expectations.expectations.aggregation_expectation import (
+from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.core.aggregation_expectation import (
     DataFrameAggregationExpectation,
 )
-from dataframe_expectations.expectations.expectation_registry import (
+from dataframe_expectations.registry import (
     ExpectationCategory,
     ExpectationSubcategory,
     register_expectation,
 )
-from dataframe_expectations.expectations.utils import requires_params
+from dataframe_expectations.core.utils import requires_params
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
     DataFrameExpectationResultMessage,
