Skip to content
Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
e642b86
Adding decorator factory to validate_schema to make it work both as …
kunaljubce Mar 29, 2024
3188b54
Fix to execute the validation when func is called and replaced the ol…
kunaljubce Mar 29, 2024
a353a69
Changes to tests to conform to new validate_schema definition
kunaljubce Mar 29, 2024
85cc47a
Updating README description for validate_schema
kunaljubce Mar 29, 2024
e38ba8e
README fix
kunaljubce Mar 29, 2024
e58ccdd
Improved documentation in README
kunaljubce Mar 29, 2024
fe843b5
Added success msg to be printed in case df schema matches the require…
kunaljubce Mar 29, 2024
a964e16
Added an uncommitted directory for developers to store their scripts o…
kunaljubce Mar 29, 2024
c856f79
Minor README documentation update
kunaljubce Mar 29, 2024
151dcc2
Moved uncommitted folder
kunaljubce Mar 29, 2024
b9a6d08
Removing uncommitted dir
kunaljubce Mar 29, 2024
7c8ae16
update column extension function names and desc in readme
Jul 12, 2024
e9f8948
Merge pull request #240 from fatemetardasti96/main
SemyonSinchenko Jul 12, 2024
21d87e5
Static type error fixes
kunaljubce Jul 16, 2024
7ab9a42
Resolved merge conflicts
kunaljubce Jul 16, 2024
1d33b91
Changed _df param name to df_to_be_validated and associated tests cha…
kunaljubce Jul 16, 2024
2fca007
README changes for _df change
kunaljubce Jul 16, 2024
c4cc8af
Remove the print_athena_create_table function
nijanthanvijayakumar Jul 10, 2024
5faadae
Remove deprecated functions exists and forall
nijanthanvijayakumar Jul 8, 2024
0823158
Remove imported and unused Callable module to avoid ruff lint failure
nijanthanvijayakumar Jul 8, 2024
a9b040f
Drop Spark-2 support and update dependencies
SemyonSinchenko Jul 14, 2024
60c7fb7
Update linting CI
SemyonSinchenko Jul 14, 2024
5b545ef
Fix typo in CI
SemyonSinchenko Jul 14, 2024
e505a21
Fix failed tests
SemyonSinchenko Jul 14, 2024
7802545
Updates from review
SemyonSinchenko Jul 14, 2024
e6ee244
Create the first-version of files for Spark-Connect tests
nijanthanvijayakumar Jul 14, 2024
dbd3f66
Address the fixtures issue in the test file
nijanthanvijayakumar Jul 15, 2024
3ef4219
Update the CI workflow to initiate the sparkconnect test on the 1.0
nijanthanvijayakumar Jul 15, 2024
b1573b4
Update the poetry & pyproject with the dependencies for Spark-Connect
nijanthanvijayakumar Jul 15, 2024
fc85013
Update the CI workflow to run Spark-Connect tests only for v3.4+
nijanthanvijayakumar Jul 15, 2024
3e8776a
Update the script to check if Spark-Connect server is running or not
nijanthanvijayakumar Jul 15, 2024
8f76b0c
Remove the spark-connect server run check
nijanthanvijayakumar Jul 15, 2024
0fb197e
Update workflows & pytest to choose the Sparksession instance based o…
nijanthanvijayakumar Jul 15, 2024
b413920
Add a TODO statement so that the spark-connect server check can be ad…
nijanthanvijayakumar Jul 15, 2024
f3cf717
Remove the 1.0 planning branch for the CI file
nijanthanvijayakumar Jul 15, 2024
0ab7493
Attribute the original script that inspired this
nijanthanvijayakumar Jul 15, 2024
3c669fc
Mark recently added deps as optional for Spark-Classic
nijanthanvijayakumar Jul 15, 2024
f62185f
Rename the spark-classic to connect & update makefile to install thes…
nijanthanvijayakumar Jul 15, 2024
93f39d1
update column extension function names and desc in readme
Jul 12, 2024
b9926fd
add acknowledgement
fpgmaas Jul 15, 2024
74545c3
Fix the linting issues in the linting CI workflow
nijanthanvijayakumar Jul 15, 2024
943918a
remove .python-version
fpgmaas Jul 15, 2024
0a71190
apply hotfix
fpgmaas Jul 15, 2024
ad45d31
run lint also on pr
fpgmaas Jul 15, 2024
f04ab78
update column extension function names and desc in readme
Jul 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,28 @@ quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"])

**validate_schema()**

Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`.
Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`. By default, `ignore_nullable` is set to False, so an exception will be raised even if column names and data types match but the nullability conditions do not.

```python
quinn.validate_schema(source_df, required_schema)
quinn.validate_schema(required_schema, _df=source_df)
```

You can also set `ignore_nullable` to True, so the validation will happen only on column names and data types, not on nullability.

```python
quinn.validate_schema(required_schema, ignore_nullable=True, _df=source_df)
```

> [!TIP]
> This function can also be used as a decorator to other functions that return a dataframe. This can help validate the schema of the returned df. When used as a decorator, you don't need to pass the `_df` argument as this validation is performed on the df returned by the base function on which the decorator is applied.
>
> ```python
> @quinn.validate_schema(required_schema, ignore_nullable=True)
> def get_df():
> return df
> ```


**validate_absence_of_columns()**

Raises an exception if `source_df` contains `age` or `cool` columns.
Expand Down
54 changes: 36 additions & 18 deletions quinn/dataframe_validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations
from __future__ import annotations # noqa: I001

import copy
from typing import TYPE_CHECKING
from typing import Any, Callable, TYPE_CHECKING

if TYPE_CHECKING:
from pyspark.sql import DataFrame
Expand Down Expand Up @@ -37,40 +37,58 @@ def validate_presence_of_columns(df: DataFrame, required_col_names: list[str]) -
if missing_col_names:
raise DataFrameMissingColumnError(error_message)


def validate_schema(
    required_schema: StructType,
    ignore_nullable: bool = False,
    _df: DataFrame | None = None,
) -> Callable[..., DataFrame] | DataFrame:
    """Validate that a DataFrame's schema contains every StructField in ``required_schema``.

    Implemented as a decorator factory, so it can be used either as a standalone
    function (by passing the DataFrame via ``_df``) or as a decorator applied to
    a function that returns a DataFrame.

    :param required_schema: StructType required for the DataFrame
    :type required_schema: StructType
    :param ignore_nullable: (Optional) A flag for if nullable fields should be
        ignored during validation
    :type ignore_nullable: bool, optional
    :param _df: DataFrame to validate, mandatory when called as a function. Not
        required when used as a decorator
    :type _df: DataFrame, optional

    :return: the decorator when used as a decorator factory, otherwise the
        validated DataFrame itself
    :raises DataFrameMissingStructFieldError: if any StructFields from the required
        schema are not included in the DataFrame schema
    """
    # Local import keeps the module-level import block unchanged.
    from functools import wraps

    def decorator(func: Callable[..., DataFrame]) -> Callable[..., DataFrame]:
        @wraps(func)  # preserve the wrapped function's name/docstring
        def wrapper(*args: object, **kwargs: object) -> DataFrame:
            dataframe = func(*args, **kwargs)
            # Deep-copy both schemas so nullability can be normalized below
            # without mutating the caller's objects.
            _all_struct_fields = copy.deepcopy(dataframe.schema)
            _required_schema = copy.deepcopy(required_schema)

            if ignore_nullable:
                # Neutralize nullability on both sides so only column names
                # and data types take part in the comparison.
                for x in _all_struct_fields:
                    x.nullable = None

                for x in _required_schema:
                    x.nullable = None

            missing_struct_fields = [x for x in _required_schema if x not in _all_struct_fields]
            error_message = f"The {missing_struct_fields} StructFields are not included in the DataFrame with the following StructFields {_all_struct_fields}"  # noqa: E501

            if missing_struct_fields:
                raise DataFrameMissingStructFieldError(error_message)

            print("Success! DataFrame matches the required schema!")

            return dataframe

        return wrapper

    if _df is None:
        # No DataFrame supplied: being used as a decorator factory.
        return decorator

    # DataFrame supplied: validate it immediately and return it.
    return decorator(lambda: _df)()


def validate_absence_of_columns(df: DataFrame, prohibited_col_names: list[str]) -> None:
Expand Down
6 changes: 3 additions & 3 deletions tests/test_dataframe_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def it_raises_when_struct_field_is_missing1():
]
)
with pytest.raises(quinn.DataFrameMissingStructFieldError) as excinfo:
quinn.validate_schema(source_df, required_schema)
quinn.validate_schema(required_schema, _df=source_df)

current_spark_version = semver.Version.parse(spark.version)
spark_330 = semver.Version.parse("3.3.0")
Expand All @@ -53,7 +53,7 @@ def it_does_nothing_when_the_schema_matches():
StructField("age", LongType(), True),
]
)
quinn.validate_schema(source_df, required_schema)
quinn.validate_schema(required_schema, _df=source_df)

def nullable_column_mismatches_are_ignored():
data = [("jose", 1), ("li", 2), ("luisa", 3)]
Expand All @@ -64,7 +64,7 @@ def nullable_column_mismatches_are_ignored():
StructField("age", LongType(), False),
]
)
quinn.validate_schema(source_df, required_schema, ignore_nullable=True)
quinn.validate_schema(required_schema, ignore_nullable=True, _df=source_df)


def describe_validate_absence_of_columns():
Expand Down