@@ -21,6 +21,14 @@ standardized solution for this use case. As a result, any contributions made her
2121pip install dataframe-expectations
2222```
2323
24+ ### Requirements
25+
26+ * Python 3.10+
27+ * pandas >= 1.5.0
28+ * pydantic >= 2.12.4
29+ * pyspark >= 3.3.0
30+ * tabulate >= 0.8.9
31+
2432### Development setup
2533
2634To set up the development environment:
@@ -45,67 +53,134 @@ uv run pytest tests/ --cov=dataframe_expectations
4553
4654### Using the library
4755
48- ** Pandas example :**
56+ **Basic usage with Pandas:**
4957``` python
50- from dataframe_expectations.expectations_suite import DataFameExpectationsSuite
58+ from dataframe_expectations.expectations_suite import DataFrameExpectationsSuite
59+ import pandas as pd
5160
61+ # Build a suite with expectations
5262suite = (
5363 DataFrameExpectationsSuite()
54- .expect_value_greater_than(" age" , 18 )
55- .expect_value_less_than(" age" , 10 )
64+ .expect_min_rows(min_rows = 3 )
65+ .expect_max_rows(max_rows = 10 )
66+ .expect_value_greater_than(column_name = " age" , value = 18 )
67+ .expect_value_less_than(column_name = " salary" , value = 100000 )
68+ .expect_value_not_null(column_name = " name" )
5669)
5770
58- # Create a Pandas DataFrame
59- import pandas as pd
60- test_pandas_df = pd.DataFrame({" age" : [20 , 15 , 30 ], " name" : [" Alice" , " Bob" , " Charlie" ]})
61-
62- suite.run(test_pandas_df)
71+ # Create a runner
72+ runner = suite.build()
6373
74+ # Validate a DataFrame
75+ df = pd.DataFrame({
76+ " age" : [25 , 15 , 45 , 22 ],
77+ " name" : [" Alice" , " Bob" , " Charlie" , " Diana" ],
78+ " salary" : [50000 , 60000 , 80000 , 45000 ]
79+ })
80+ runner.run(df)
6481```
6582
66-
6783**PySpark example:**
6884``` python
6985from dataframe_expectations.expectations_suite import DataFrameExpectationsSuite
86+ from pyspark.sql import SparkSession
7087
88+ # Initialize Spark session
89+ spark = SparkSession.builder.appName(" example" ).getOrCreate()
90+
91+ # Build a validation suite (same API as Pandas!)
7192suite = (
7293 DataFrameExpectationsSuite()
73- .expect_value_greater_than(" age" , 18 )
74- .expect_value_less_than(" age" , 40 )
94+ .expect_min_rows(min_rows = 3 )
95+ .expect_max_rows(max_rows = 10 )
96+ .expect_value_greater_than(column_name = " age" , value = 18 )
97+ .expect_value_less_than(column_name = " salary" , value = 100000 )
98+ .expect_value_not_null(column_name = " name" )
7599)
76100
101+ # Build the runner
102+ runner = suite.build()
103+
77104# Create a PySpark DataFrame
78- test_spark_df = spark.createDataFrame(
79- [
80- {" name" : " Alice" , " age" : 20 },
81- {" name" : " Bob" , " age" : 15 },
82- {" name" : " Charlie" , " age" : 30 },
83- ]
84- )
105+ data = [
106+ {" age" : 25 , " name" : " Alice" , " salary" : 50000 },
107+ {" age" : 15 , " name" : " Bob" , " salary" : 60000 },
108+ {" age" : 45 , " name" : " Charlie" , " salary" : 80000 },
109+ {" age" : 22 , " name" : " Diana" , " salary" : 45000 }
110+ ]
111+ df = spark.createDataFrame(data)
112+
113+ # Validate
114+ runner.run(df)
115+ ```
116+
117+ **Decorator pattern for automatic validation:**
118+ ``` python
119+ from dataframe_expectations.expectations_suite import DataFrameExpectationsSuite
120+ from pyspark.sql import SparkSession
85121
86- suite.run(test_spark_df)
122+ # Initialize Spark session
123+ spark = SparkSession.builder.appName(" example" ).getOrCreate()
124+
125+ suite = (
126+ DataFrameExpectationsSuite()
127+ .expect_min_rows(min_rows = 3 )
128+ .expect_max_rows(max_rows = 10 )
129+ .expect_value_greater_than(column_name = " age" , value = 18 )
130+ .expect_value_less_than(column_name = " salary" , value = 100000 )
131+ .expect_value_not_null(column_name = " name" )
132+ )
87133
134+ # Build the runner
135+ runner = suite.build()
136+
137+ # Apply decorator to automatically validate function output
138+ @runner.validate
139+ def load_employee_data ():
140+ """ Load and return employee data - automatically validated."""
141+ return spark.createDataFrame(
142+ [
143+ {" age" : 25 , " name" : " Alice" , " salary" : 50000 },
144+ {" age" : 15 , " name" : " Bob" , " salary" : 60000 },
145+ {" age" : 45 , " name" : " Charlie" , " salary" : 80000 },
146+ {" age" : 22 , " name" : " Diana" , " salary" : 45000 }
147+ ]
148+ )
149+
150+ # Function execution automatically validates the returned DataFrame
151+ df = load_employee_data() # Raises DataFrameExpectationsSuiteFailure if validation fails
152+
153+ # Allow functions that may return None
154+ @runner.validate (allow_none = True )
155+ def conditional_load (should_load : bool ):
156+ """ Conditionally load data - validation only runs when DataFrame is returned."""
157+ if should_load:
158+ return spark.createDataFrame([{" age" : 25 , " name" : " Alice" , " salary" : 50000 }])
159+ return None # No validation when None is returned
88160```
89161
90162**Output:**
91163``` python
92164========================== Running expectations suite ==========================
93- ExpectationValueGreaterThan (' age' greater than 18 ) ... FAIL
94- ExpectationValueLessThan (' age' less than 40 ) ... OK
95- ============================ 1 success, 1 failures ============================ =
165+ ExpectationMinRows (DataFrame contains at least 3 rows) ... OK
166+ ExpectationMaxRows (DataFrame contains at most 10 rows) ... OK
167+ ExpectationValueGreaterThan ('age' is greater than 18) ... FAIL
168+ ExpectationValueLessThan ('salary' is less than 100000) ... OK
169+ ExpectationValueNotNull ('name' is not null) ... OK
170+ ============================ 4 success, 1 failures =============================
96171
97- ExpectationSuiteFailure: (1 / 2 ) expectations failed.
172+ ExpectationSuiteFailure: (1/5) expectations failed.
98173
99174================================================================================
100175List of violations:
101176--------------------------------------------------------------------------------
102- [Failed 1 / 1 ] ExpectationValueGreaterThan (' age' greater than 18 ): Found 1 row(s) where ' age' is not greater than 18 .
177+ [Failed 1/1] ExpectationValueGreaterThan ('age' is greater than 18): Found 1 row(s) where 'age' is not greater than 18.
103178Some examples of violations:
104- + ---- -+ ------ +
105- | age | name |
106- + ---- -+ ------ +
107- | 15 | Bob |
108- + ---- -+ ------ +
179+ +-----+------+--------+
180+ | age | name | salary |
181+ +-----+------+--------+
182+ |  15 | Bob  |  60000 |
183+ +-----+------+--------+
109184================================================================================
110185
111186```
0 commit comments