diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 0211d85..72386fa 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -33,7 +33,7 @@ jobs: uv run python scripts/sanity_checks.py - name: Run tests run: | - uv run pytest tests/ --cov=dataframe_expectations + uv run pytest tests/ -n auto --tb=line --cov=dataframe_expectations lint: runs-on: ubuntu-latest diff --git a/README.md b/README.md index c1ae71b..892e16b 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,44 @@ Some examples of violations: ``` +**Tag-based filtering for selective execution:** +```python +from dataframe_expectations import DataFrameExpectationsSuite, TagMatchMode + +# Tag expectations with priorities and environments +suite = ( + DataFrameExpectationsSuite() + .expect_value_greater_than(column_name="age", value=18, tags=["priority:high", "env:prod"]) + .expect_value_not_null(column_name="name", tags=["priority:high"]) + .expect_min_rows(min_rows=1, tags=["priority:low", "env:test"]) +) + +# Run only high-priority checks (OR logic - matches ANY tag) +runner = suite.build(tags=["priority:high"], tag_match_mode=TagMatchMode.ANY) +runner.run(df) + +# Run production-critical checks (AND logic - matches ALL tags) +runner = suite.build(tags=["priority:high", "env:prod"], tag_match_mode=TagMatchMode.ALL) +runner.run(df) +``` + +**Programmatic result inspection:** +```python +# Get detailed results without raising exceptions +result = runner.run(df, raise_on_failure=False) + +# Inspect validation outcomes +print(f"Total: {result.total_expectations}, Passed: {result.total_passed}, Failed: {result.total_failed}") +print(f"Pass rate: {result.pass_rate:.2%}") +print(f"Duration: {result.total_duration_seconds:.2f}s") +print(f"Applied filters: {result.applied_filters}") + +# Access individual results +for exp_result in result.results: + if exp_result.status == "failed": + print(f"Failed: {exp_result.description} - {exp_result.violation_count} violations") +``` + ### How to contribute? Contributions are welcome! You can enhance the library by adding new expectations, refining existing ones, or improving the testing framework. 
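The tag filtering shown in the README additions above is backed by the new `TagSet` helper introduced in `dataframe_expectations/core/tagging.py` later in this diff. A minimal sketch of the two match modes, assuming the module path from this diff (`TagSet` is not re-exported at the package root; only `TagMatchMode` is):

```python
from dataframe_expectations.core.tagging import TagSet

# Tags attached to an expectation, parsed from "key:value" strings
exp_tags = TagSet(["priority:high", "env:test"])

# ANY (OR) mode: a single overlapping key:value pair is enough
assert exp_tags.has_any_tag_from(TagSet(["priority:high", "env:prod"]))

# ALL (AND) mode: every value requested for a key must be present on the expectation
assert exp_tags.has_all_tags_from(TagSet(["priority:high", "env:test"]))
assert not exp_tags.has_all_tags_from(TagSet(["priority:high", "env:prod"]))
```

Note that ALL mode uses a subset check (`required_values.issubset(...)`), so a filter such as `["env:test", "env:prod"]` only matches expectations tagged with both values for the `env` key.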
diff --git a/dataframe_expectations/__init__.py b/dataframe_expectations/__init__.py index 0711927..adfc5b7 100644 --- a/dataframe_expectations/__init__.py +++ b/dataframe_expectations/__init__.py @@ -9,4 +9,24 @@ # Catch all exceptions to handle various edge cases in different environments __version__ = "0.0.0.dev0" -__all__ = [] +from dataframe_expectations.core.suite_result import ( + ExpectationResult, + SuiteExecutionResult, + serialize_violations, +) +from dataframe_expectations.core.types import TagMatchMode +from dataframe_expectations.suite import ( + DataFrameExpectationsSuite, + DataFrameExpectationsSuiteRunner, + DataFrameExpectationsSuiteFailure, +) + +__all__ = [ + "ExpectationResult", + "SuiteExecutionResult", + "serialize_violations", + "DataFrameExpectationsSuite", + "DataFrameExpectationsSuiteRunner", + "DataFrameExpectationsSuiteFailure", + "TagMatchMode", +] diff --git a/dataframe_expectations/core/__init__.py b/dataframe_expectations/core/__init__.py index 436fc0d..4526337 100644 --- a/dataframe_expectations/core/__init__.py +++ b/dataframe_expectations/core/__init__.py @@ -1,3 +1,15 @@ """Core base classes and interfaces for DataFrame expectations.""" -__all__ = [] +from dataframe_expectations.core.suite_result import ( + ExpectationResult, + ExpectationStatus, + SuiteExecutionResult, + serialize_violations, +) + +__all__ = [ + "ExpectationResult", + "ExpectationStatus", + "SuiteExecutionResult", + "serialize_violations", +] diff --git a/dataframe_expectations/core/aggregation_expectation.py b/dataframe_expectations/core/aggregation_expectation.py index 351a8e4..8979f13 100644 --- a/dataframe_expectations/core/aggregation_expectation.py +++ b/dataframe_expectations/core/aggregation_expectation.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import List, Union +from typing import List, Optional, Union from dataframe_expectations.core.types import DataFrameLike, DataFrameType from dataframe_expectations.core.expectation import DataFrameExpectation @@ -20,6 +20,7 @@ def __init__( expectation_name: str, column_names: List[str], description: str, + tags: Optional[List[str]] = None, ): """ Template for implementing DataFrame aggregation expectations, where data is first aggregated @@ -28,7 +29,10 @@ def __init__( :param expectation_name: The name of the expectation. This will be used during logging. :param column_names: The list of column names to aggregate on. :param description: A description of the expectation used in logging. + :param tags: Optional tags as list of strings in "key:value" format. 
+ Example: ["priority:high", "env:test"] """ + super().__init__(tags=tags) self.expectation_name = expectation_name self.column_names = column_names self.description = description diff --git a/dataframe_expectations/core/column_expectation.py b/dataframe_expectations/core/column_expectation.py index 392f82a..e7a4262 100644 --- a/dataframe_expectations/core/column_expectation.py +++ b/dataframe_expectations/core/column_expectation.py @@ -1,4 +1,4 @@ -from typing import Callable +from typing import Callable, List, Optional from dataframe_expectations.core.types import DataFrameLike, DataFrameType from dataframe_expectations.core.expectation import DataFrameExpectation @@ -23,6 +23,7 @@ def __init__( fn_violations_pyspark: Callable, description: str, error_message: str, + tags: Optional[List[str]] = None, ): """ Template for implementing DataFrame column expectations, where a column value is tested against a @@ -34,7 +35,10 @@ def __init__( :param fn_violations_pyspark: Function to find violations in a PySpark DataFrame. :param description: A description of the expectation used in logging. :param error_message: The error message to return if the expectation fails. + :param tags: Optional tags as list of strings in "key:value" format. + Example: ["priority:high", "env:test"] """ + super().__init__(tags=tags) self.column_name = column_name self.expectation_name = expectation_name self.fn_violations_pandas = fn_violations_pandas diff --git a/dataframe_expectations/core/expectation.py b/dataframe_expectations/core/expectation.py index c677b32..007a72f 100644 --- a/dataframe_expectations/core/expectation.py +++ b/dataframe_expectations/core/expectation.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import cast +from typing import List, Optional, cast from pandas import DataFrame as PandasDataFrame from pyspark.sql import DataFrame as PySparkDataFrame @@ -12,6 +12,7 @@ PySparkConnectDataFrame = None # type: ignore[misc,assignment] from dataframe_expectations.core.types import DataFrameLike, DataFrameType +from dataframe_expectations.core.tagging import TagSet from dataframe_expectations.result_message import ( DataFrameExpectationResultMessage, ) @@ -22,6 +23,20 @@ class DataFrameExpectation(ABC): Base class for DataFrame expectations. """ + def __init__(self, tags: Optional[List[str]] = None): + """ + Initialize the base expectation with optional tags. + :param tags: Optional tags as list of strings in "key:value" format. + Example: ["priority:high", "env:test"] + """ + self.__tags = TagSet(tags) + + def get_tags(self) -> TagSet: + """ + Returns the tags for this expectation. + """ + return self.__tags + def get_expectation_name(self) -> str: """ Returns the class name as the expectation name. @@ -48,16 +63,17 @@ def infer_data_frame_type(cls, data_frame: DataFrameLike) -> DataFrameType: """ Infer the DataFrame type based on the provided DataFrame. 
""" - if isinstance(data_frame, PandasDataFrame): - return DataFrameType.PANDAS - elif isinstance(data_frame, PySparkDataFrame): - return DataFrameType.PYSPARK - elif PySparkConnectDataFrame is not None and isinstance( - data_frame, PySparkConnectDataFrame - ): - return DataFrameType.PYSPARK - else: - raise ValueError(f"Unsupported DataFrame type: {type(data_frame)}") + match data_frame: + case PandasDataFrame(): + return DataFrameType.PANDAS + case PySparkDataFrame(): + return DataFrameType.PYSPARK + case _ if PySparkConnectDataFrame is not None and isinstance( + data_frame, PySparkConnectDataFrame + ): + return DataFrameType.PYSPARK + case _: + raise ValueError(f"Unsupported DataFrame type: {type(data_frame)}") def validate(self, data_frame: DataFrameLike, **kwargs): """ @@ -65,12 +81,13 @@ def validate(self, data_frame: DataFrameLike, **kwargs): """ data_frame_type = self.infer_data_frame_type(data_frame) - if data_frame_type == DataFrameType.PANDAS: - return self.validate_pandas(data_frame=data_frame, **kwargs) - elif data_frame_type == DataFrameType.PYSPARK: - return self.validate_pyspark(data_frame=data_frame, **kwargs) - else: - raise ValueError(f"Unsupported DataFrame type: {data_frame_type}") + match data_frame_type: + case DataFrameType.PANDAS: + return self.validate_pandas(data_frame=data_frame, **kwargs) + case DataFrameType.PYSPARK: + return self.validate_pyspark(data_frame=data_frame, **kwargs) + case _: + raise ValueError(f"Unsupported DataFrame type: {data_frame_type}") @abstractmethod def validate_pandas( diff --git a/dataframe_expectations/core/suite_result.py b/dataframe_expectations/core/suite_result.py new file mode 100644 index 0000000..6a54c96 --- /dev/null +++ b/dataframe_expectations/core/suite_result.py @@ -0,0 +1,178 @@ +"""Suite execution result models for capturing validation outcomes.""" + +from datetime import datetime +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field, computed_field + +from dataframe_expectations.core.types import DataFrameType, DataFrameLike, TagMatchMode +from dataframe_expectations.core.tagging import TagSet +import logging + +from enum import Enum + +logger = logging.getLogger(__name__) + + +class ExpectationStatus(str, Enum): + PASSED = "passed" + FAILED = "failed" + SKIPPED = "skipped" + + +class ExpectationResult(BaseModel): + """ + Representation of a single expectation result within a suite execution. + Captures the outcome (passed, failed, skipped) using status. + Does not store raw dataframes, only serialized violation samples. + """ + + expectation_name: str = Field(..., description="Name of the expectation class") + description: str = Field(..., description="Human-readable description of the expectation") + status: ExpectationStatus = Field(..., description="Outcome status: passed, failed, or skipped") + tags: Optional[TagSet] = Field( + default=None, description="User-defined tags for this specific expectation" + ) + error_message: Optional[str] = Field( + default=None, description="Error message if expectation failed" + ) + violation_count: Optional[int] = Field( + default=None, description="Total count of violations (if applicable)" + ) + violation_sample: Optional[List[Dict[str, Any]]] = Field( + default=None, + description="Sample of violations as list of dicts (limited by violation_sample_limit)", + ) + + model_config = {"frozen": True} # Make immutable + + +class SuiteExecutionResult(BaseModel): + """Result of a complete suite execution. 
+ Captures all metadata about the suite run including timing, dataframe info, + and individual expectation results. Does not store raw dataframes. + """ + + suite_name: Optional[str] = Field(default=None, description="Optional name for the suite") + context: Dict[str, Any] = Field( + default_factory=dict, description="Additional runtime metadata (e.g., job_id, environment)" + ) + applied_filters: TagSet = Field( + default_factory=TagSet, description="Tag filters that were applied to select expectations" + ) + tag_match_mode: Optional[TagMatchMode] = Field( + default=None, + description="How tags were matched: TagMatchMode.ANY (OR) or TagMatchMode.ALL (AND)", + ) + results: List[ExpectationResult] = Field( + ..., description="Results for each expectation in execution order (including skipped)" + ) + start_time: datetime = Field(..., description="Suite execution start timestamp") + end_time: datetime = Field(..., description="Suite execution end timestamp") + dataframe_type: DataFrameType = Field(..., description="Type of dataframe validated") + dataframe_row_count: int = Field(..., description="Number of rows in validated dataframe") + dataframe_was_cached: bool = Field( + default=False, description="Whether PySpark dataframe was cached during execution" + ) + + model_config = {"frozen": True} # Make immutable + + @computed_field # type: ignore[misc] + @property + def total_duration_seconds(self) -> float: + """Total execution time in seconds.""" + return (self.end_time - self.start_time).total_seconds() + + @computed_field # type: ignore[misc] + @property + def total_expectations(self) -> int: + """Total number of expectations in the suite (including skipped).""" + return len(self.results) + + @computed_field # type: ignore[misc] + @property + def total_passed(self) -> int: + """Number of expectations that passed.""" + return sum(1 for r in self.results if r.status == ExpectationStatus.PASSED) + + @computed_field # type: ignore[misc] + @property + def total_failed(self) -> int: + """Number of expectations that failed.""" + return sum(1 for r in self.results if r.status == ExpectationStatus.FAILED) + + @computed_field # type: ignore[misc] + @property + def total_skipped(self) -> int: + """Number of expectations that were skipped due to tag filtering.""" + return sum(1 for r in self.results if r.status == ExpectationStatus.SKIPPED) + + @computed_field # type: ignore[misc] + @property + def pass_rate(self) -> float: + """Percentage of expectations that passed (0.0 to 1.0).""" + executed = self.total_passed + self.total_failed + if executed == 0: + return 1.0 + return self.total_passed / executed + + @computed_field # type: ignore[misc] + @property + def success(self) -> bool: + """Whether all executed expectations passed (ignores skipped).""" + return self.total_failed == 0 + + @computed_field # type: ignore[misc] + @property + def passed_expectations(self) -> List[ExpectationResult]: + """List of expectations that passed.""" + return [r for r in self.results if r.status == ExpectationStatus.PASSED] + + @computed_field # type: ignore[misc] + @property + def failed_expectations(self) -> List[ExpectationResult]: + """List of expectations that failed.""" + return [r for r in self.results if r.status == ExpectationStatus.FAILED] + + @computed_field # type: ignore[misc] + @property + def skipped_expectations(self) -> List[ExpectationResult]: + """List of expectations that were skipped due to tag filtering.""" + return [r for r in self.results if r.status == ExpectationStatus.SKIPPED] + + +def 
serialize_violations( + violations_df: Optional[DataFrameLike], + df_type: DataFrameType, + limit: int = 5, +) -> tuple[Optional[int], Optional[List[Dict[str, Any]]]]: + """Serialize violation dataframe to count and sample for storage. + + Converts dataframes to JSON-serializable format without storing raw dataframes. + + :param violations_df: DataFrame containing violations (pandas or PySpark). + :param df_type: Type of the violations dataframe. + :param limit: Maximum number of violation rows to include in sample. + :return: Tuple of (total_count, sample_as_list_of_dicts). + """ + if violations_df is None: + return None, None + + count: Optional[int] = None + sample: Optional[list[dict[str, Any]]] = None + + try: + if df_type == DataFrameType.PANDAS: + pandas_df = violations_df # type: ignore[assignment] + count = len(pandas_df) # type: ignore[arg-type] + sample = pandas_df.head(limit).to_dict("records") # type: ignore[assignment,union-attr] + elif df_type == DataFrameType.PYSPARK: + pyspark_df = violations_df # type: ignore[assignment] + count = pyspark_df.count() # type: ignore[assignment] + sample = pyspark_df.limit(limit).toPandas().to_dict("records") # type: ignore[assignment,operator] + + return count, sample + except Exception: + # If serialization fails, return None to avoid breaking the suite + logger.warning("Failed to serialize violations dataframe", exc_info=True) + return None, None diff --git a/dataframe_expectations/core/tagging.py b/dataframe_expectations/core/tagging.py new file mode 100644 index 0000000..af93d42 --- /dev/null +++ b/dataframe_expectations/core/tagging.py @@ -0,0 +1,165 @@ +"""Tagging system for filtering and organizing expectations. + +Simple tag-based filtering using "key:value" format strings. +Tags are stored internally as Dict[key, Set[values]] for efficient matching. +""" + +from __future__ import annotations + +from typing import Dict, List, Optional, Set + +from pydantic import BaseModel, ConfigDict + + +class TagSet(BaseModel): + """ + Collection of tags organized by key, supporting multiple values per key. + + Internal structure: Dict[key, Set[values]] + Example: {"priority": {"high", "medium"}, "env": {"test", "prod"}} + + Tags are specified as strings in "key:value" format. + """ + + tags: Dict[str, Set[str]] = {} + + model_config = ConfigDict(frozen=True) # Make immutable + + def __init__(self, tags: Optional[List[str]] = None, **data): + """ + Initialize TagSet from a list of tag strings. + + :param tags: List of tag strings in "key:value" format + Example: ["priority:high", "env:test", "priority:medium"] + + Examples: + >>> TagSet(["priority:high", "env:test"]) + >>> TagSet(["priority:high", "priority:medium"]) # Multiple values for same key + """ + # Parse tags if provided as list + if tags is not None: + parsed_tags: Dict[str, Set[str]] = {} + for tag_string in tags: + TagSet._parse_and_add_tag(tag_string, parsed_tags) + data["tags"] = parsed_tags + + super().__init__(**data) + + @staticmethod + def _parse_and_add_tag(tag_string: str, tags_dict: Dict[str, Set[str]]) -> None: + """ + Parse and add a tag string to the provided dictionary. + + :param tag_string: Tag string to parse + :param tags_dict: Dictionary to add parsed tag to + :raises ValueError: If format is invalid + """ + tag_string = tag_string.strip() + + if ":" not in tag_string: + raise ValueError(f"Invalid tag format '{tag_string}'. 
Expected 'key:value' format.") + + parts = tag_string.split(":", 1) + if len(parts) != 2: + raise ValueError( + f"Invalid tag format '{tag_string}'. Expected exactly one ':' separator." + ) + + key, value = parts[0].strip(), parts[1].strip() + + if not key or not value: + raise ValueError("Tag key and value must be non-empty strings") + + if key not in tags_dict: + tags_dict[key] = set() + tags_dict[key].add(value) + + def has_any_tag_from(self, other: TagSet) -> bool: + """ + Check if this TagSet has ANY tag from the other TagSet (OR logic). + + For each key in 'other', checks if there's any overlap in values. + Returns True if ANY key has any overlapping values. + + :param other: TagSet to match against + :return: True if any tag matches + + Examples: + self = TagSet(["priority:high", "env:test"]) + other = TagSet(["priority:high"]) + self.has_any_tag_from(other) -> True (priority:high matches) + + other = TagSet(["priority:medium"]) + self.has_any_tag_from(other) -> False + + other = TagSet(["priority:medium", "env:test"]) + self.has_any_tag_from(other) -> True (env:test matches) + """ + if not other.tags: + return True # Empty filter matches everything + + # OR logic: any key with overlapping values + for key, required_values in other.tags.items(): + if key in self.tags: + # Check if there's any overlap between required values and our values + if required_values & self.tags[key]: + return True + + return False + + def has_all_tags_from(self, other: TagSet) -> bool: + """ + Check if this TagSet has ALL tags from the other TagSet (AND logic). + + For each key in 'other', checks if there's any overlap in values. + Returns True only if ALL keys from other have overlapping values. + + :param other: TagSet to match against + :return: True if all tags match + + Examples: + self = TagSet(["priority:high", "env:test", 'role:admin']) + other = TagSet(["priority:high", "env:test"]) + self.has_all_tags_from(other) -> True (both match) + + other = TagSet(["priority:high"]) + self.has_all_tags_from(other) -> True (priority:high matches) + + other = TagSet(["priority:high", "env:prod"]) + self.has_all_tags_from(other) -> False (env:prod doesn't match) + """ + if not other.tags: + return True # Empty filter matches everything + + # AND logic: all keys must have ALL required values present + for key, required_values in other.tags.items(): + if key not in self.tags: + return False + # Check if ALL required values are present in our values + if not required_values.issubset(self.tags[key]): + return False + + return True + + def is_empty(self) -> bool: + """Check if TagSet has no tags.""" + return len(self.tags) == 0 + + def __len__(self) -> int: + """Return total number of unique tags (key:value pairs).""" + return sum(len(values) for values in self.tags.values()) + + def __bool__(self) -> bool: + """Return True if TagSet has any tags.""" + return bool(self.tags) + + def __str__(self) -> str: + """String representation showing all tags.""" + tag_list = [] + for key in sorted(self.tags.keys()): + for value in sorted(self.tags[key]): + tag_list.append(f"{key}:{value}") + return f"TagSet({', '.join(tag_list)})" if tag_list else "TagSet(empty)" + + def __repr__(self) -> str: + return self.__str__() diff --git a/dataframe_expectations/core/types.py b/dataframe_expectations/core/types.py index e681994..112daa1 100644 --- a/dataframe_expectations/core/types.py +++ b/dataframe_expectations/core/types.py @@ -18,6 +18,13 @@ class DataFrameType(str, Enum): PYSPARK = "pyspark" +class TagMatchMode(str, Enum): + 
"""Enum for tag matching modes.""" + + ANY = "any" # OR logic: expectation matches if it has ANY of the filter tags + ALL = "all" # AND logic: expectation matches if it has ALL of the filter tags + + class ExpectationCategory(str, Enum): """Categories for expectations.""" diff --git a/dataframe_expectations/expectations/aggregation/any_value.py b/dataframe_expectations/expectations/aggregation/any_value.py index c59111e..a6f5adc 100644 --- a/dataframe_expectations/expectations/aggregation/any_value.py +++ b/dataframe_expectations/expectations/aggregation/any_value.py @@ -1,4 +1,4 @@ -from typing import cast +from typing import List, Optional, cast from pandas import DataFrame as PandasDataFrame from pyspark.sql import DataFrame as PySparkDataFrame @@ -35,23 +35,23 @@ class ExpectationMinRows(DataFrameAggregationExpectation): - ExpectationMinRows(min_rows=150) → FAIL """ - def __init__(self, min_rows: int): + def __init__(self, min_rows: int, tags: Optional[List[str]] = None): """ Initialize the minimum rows expectation. :param min_rows: Minimum number of rows required (inclusive). + :param tags: Optional key-value tags for this expectation. """ if min_rows < 0: raise ValueError(f"min_rows must be non-negative, got {min_rows}") - description = f"DataFrame contains at least {min_rows} rows" - self.min_rows = min_rows super().__init__( expectation_name="ExpectationMinRows", column_names=[], # No specific columns required - description=description, + description=f"DataFrame contains at least {min_rows} rows", + tags=tags, ) def aggregate_and_validate_pandas( @@ -120,23 +120,23 @@ class ExpectationMaxRows(DataFrameAggregationExpectation): - ExpectationMaxRows(max_rows=50) → FAIL """ - def __init__(self, max_rows: int): + def __init__(self, max_rows: int, tags: Optional[List[str]] = None): """ Initialize the maximum rows expectation. :param max_rows: Maximum number of rows allowed (inclusive). + :param tags: Optional key-value tags for this expectation. """ if max_rows < 0: raise ValueError(f"max_rows must be non-negative, got {max_rows}") - description = f"DataFrame contains at most {max_rows} rows" - self.max_rows = max_rows super().__init__( expectation_name="ExpectationMaxRows", column_names=[], # No specific columns required - description=description, + description=f"DataFrame contains at most {max_rows} rows", + tags=tags, ) def aggregate_and_validate_pandas( @@ -209,25 +209,25 @@ class ExpectationMaxNullPercentage(DataFrameAggregationExpectation): Note: The percentage is expressed as a value between 0.0 and 100.0 (e.g., 5.5 for 5.5%). """ - def __init__(self, column_name: str, max_percentage: float): + def __init__(self, column_name: str, max_percentage: float, tags: Optional[List[str]] = None): """ Initialize the maximum null percentage expectation. :param column_name: Name of the column to check for null percentage. :param max_percentage: Maximum percentage of null values allowed (0.0-100.0). + :param tags: Optional key-value tags for this expectation. 
""" if not 0 <= max_percentage <= 100: raise ValueError(f"max_percentage must be between 0.0 and 100.0, got {max_percentage}") - description = f"column '{column_name}' null percentage is at most {max_percentage}%" - self.column_name = column_name self.max_percentage = max_percentage super().__init__( expectation_name="ExpectationMaxNullPercentage", column_names=[column_name], # Specify the required column - description=description, + description=f"column '{column_name}' null percentage is at most {max_percentage}%", + tags=tags, ) def aggregate_and_validate_pandas( @@ -326,25 +326,25 @@ class ExpectationMaxNullCount(DataFrameAggregationExpectation): Note: The count is the absolute number of null values, not a percentage. """ - def __init__(self, column_name: str, max_count: int): + def __init__(self, column_name: str, max_count: int, tags: Optional[List[str]] = None): """ Initialize the maximum null count expectation. :param column_name: Name of the column to check for null count. :param max_count: Maximum number of null values allowed. + :param tags: Optional key-value tags for this expectation. """ if max_count < 0: raise ValueError(f"max_count must be non-negative, got {max_count}") - description = f"column '{column_name}' has at most {max_count} null values" - self.column_name = column_name self.max_count = max_count super().__init__( expectation_name="ExpectationMaxNullCount", column_names=[column_name], # Specify the required column - description=description, + description=f"column '{column_name}' has at most {max_count} null values", + tags=tags, ) def aggregate_and_validate_pandas( @@ -423,14 +423,17 @@ def aggregate_and_validate_pyspark( }, ) @requires_params("min_rows", types={"min_rows": int}) -def create_expectation_min_rows(min_rows: int) -> ExpectationMinRows: +def create_expectation_min_rows( + min_rows: int, tags: Optional[List[str]] = None +) -> ExpectationMinRows: """ Create an ExpectMinRows instance. :param min_rows: Minimum number of rows required. + :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance. """ - return ExpectationMinRows(min_rows=min_rows) + return ExpectationMinRows(min_rows=min_rows, tags=tags) @register_expectation( @@ -443,14 +446,17 @@ def create_expectation_min_rows(min_rows: int) -> ExpectationMinRows: }, ) @requires_params("max_rows", types={"max_rows": int}) -def create_expectation_max_rows(max_rows: int) -> ExpectationMaxRows: +def create_expectation_max_rows( + max_rows: int, tags: Optional[List[str]] = None +) -> ExpectationMaxRows: """ Create an ExpectationMaxRows instance. :param max_rows: Maximum number of rows allowed. + :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance. """ - return ExpectationMaxRows(max_rows=max_rows) + return ExpectationMaxRows(max_rows=max_rows, tags=tags) @register_expectation( @@ -469,18 +475,20 @@ def create_expectation_max_rows(max_rows: int) -> ExpectationMaxRows: types={"column_name": str, "max_percentage": (int, float)}, ) def create_expectation_max_null_percentage( - column_name: str, max_percentage: float + column_name: str, max_percentage: float, tags: Optional[List[str]] = None ) -> ExpectationMaxNullPercentage: """ Create an ExpectationMaxNullPercentage instance. :param column_name: Name of the column to check for null percentage. :param max_percentage: Maximum percentage of null values allowed (0.0-100.0). + :param tags: Optional tags as list of strings in "key:value" format. 
:return: A configured expectation instance. """ return ExpectationMaxNullPercentage( column_name=column_name, max_percentage=max_percentage, + tags=tags, ) @@ -499,15 +507,19 @@ def create_expectation_max_null_percentage( "max_count", types={"column_name": str, "max_count": int}, ) -def create_expectation_max_null_count(column_name: str, max_count: int) -> ExpectationMaxNullCount: +def create_expectation_max_null_count( + column_name: str, max_count: int, tags: Optional[List[str]] = None +) -> ExpectationMaxNullCount: """ Create an ExpectationMaxNullCount instance. :param column_name: Name of the column to check for null count. :param max_count: Maximum number of null values allowed. + :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance. """ return ExpectationMaxNullCount( column_name=column_name, max_count=max_count, + tags=tags, ) diff --git a/dataframe_expectations/expectations/aggregation/numerical.py b/dataframe_expectations/expectations/aggregation/numerical.py index 2d05a02..7fd7101 100644 --- a/dataframe_expectations/expectations/aggregation/numerical.py +++ b/dataframe_expectations/expectations/aggregation/numerical.py @@ -1,4 +1,4 @@ -from typing import Union, cast +from typing import List, Optional, Union, cast import pandas as pd from pandas import DataFrame as PandasDataFrame @@ -50,6 +50,7 @@ def __init__( quantile: float, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ): """ Initialize the column quantile between expectation. @@ -86,6 +87,7 @@ def __init__( expectation_name="ExpectationColumnQuantileBetween", column_names=[column_name], description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -220,6 +222,7 @@ def __init__( column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ): """ Initialize the column mean between expectation. @@ -238,6 +241,7 @@ def __init__( expectation_name="ExpectationColumnMeanBetween", column_names=[column_name], description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -346,6 +350,7 @@ def create_expectation_column_quantile_between( quantile: float, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> ExpectationColumnQuantileBetween: """ Create an ExpectationColumnQuantileBetween instance. @@ -354,6 +359,7 @@ def create_expectation_column_quantile_between( :param quantile: Quantile to compute (0.0 to 1.0). :param min_value: Minimum allowed value for the column quantile. :param max_value: Maximum allowed value for the column quantile. + :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance. """ return ExpectationColumnQuantileBetween( @@ -361,6 +367,7 @@ def create_expectation_column_quantile_between( quantile=quantile, min_value=min_value, max_value=max_value, + tags=tags, ) @@ -386,6 +393,7 @@ def create_expectation_column_max_to_be_between( column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> ExpectationColumnQuantileBetween: """ Create an ExpectationColumnQuantileBetween instance for maximum values (quantile=1.0). @@ -393,6 +401,7 @@ def create_expectation_column_max_to_be_between( :param column_name: Name of the column to check. :param min_value: Minimum allowed value for the column maximum. :param max_value: Maximum allowed value for the column maximum. 
+ :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance for maximum values. """ return ExpectationColumnQuantileBetween( @@ -400,6 +409,7 @@ def create_expectation_column_max_to_be_between( quantile=1.0, min_value=min_value, max_value=max_value, + tags=tags, ) @@ -424,6 +434,7 @@ def create_expectation_column_min_to_be_between( column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> ExpectationColumnQuantileBetween: """ Create an ExpectationColumnQuantileBetween instance for minimum values (quantile=0.0). @@ -431,6 +442,7 @@ def create_expectation_column_min_to_be_between( :param column_name: Name of the column to check. :param min_value: Minimum allowed value for the column minimum. :param max_value: Maximum allowed value for the column minimum. + :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance for minimum values. """ return ExpectationColumnQuantileBetween( @@ -438,6 +450,7 @@ def create_expectation_column_min_to_be_between( quantile=0.0, min_value=min_value, max_value=max_value, + tags=tags, ) @@ -462,6 +475,7 @@ def create_expectation_column_mean_to_be_between( column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> ExpectationColumnMeanBetween: """ Create a custom ExpectationColumnMeanBetween instance for mean values. @@ -470,6 +484,7 @@ def create_expectation_column_mean_to_be_between( :param column_name: Name of the column to check. :param min_value: Minimum allowed value for the column mean. :param max_value: Maximum allowed value for the column mean. + :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance for mean values. """ # For mean, we need a separate class since it's not a quantile @@ -477,6 +492,7 @@ def create_expectation_column_mean_to_be_between( column_name=column_name, min_value=min_value, max_value=max_value, + tags=tags, ) @@ -501,6 +517,7 @@ def create_expectation_column_median_to_be_between( column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> ExpectationColumnQuantileBetween: """ Create an ExpectationColumnQuantileBetween instance for median values (quantile=0.5). @@ -508,6 +525,7 @@ def create_expectation_column_median_to_be_between( :param column_name: Name of the column to check. :param min_value: Minimum allowed value for the column median. :param max_value: Maximum allowed value for the column median. + :param tags: Optional tags as list of strings in "key:value" format. :return: A configured expectation instance for median values. 
""" return ExpectationColumnQuantileBetween( @@ -515,4 +533,5 @@ def create_expectation_column_median_to_be_between( quantile=0.5, min_value=min_value, max_value=max_value, + tags=tags, ) diff --git a/dataframe_expectations/expectations/aggregation/unique.py b/dataframe_expectations/expectations/aggregation/unique.py index 247e238..5233824 100644 --- a/dataframe_expectations/expectations/aggregation/unique.py +++ b/dataframe_expectations/expectations/aggregation/unique.py @@ -1,4 +1,4 @@ -from typing import List, cast +from typing import List, Optional, cast import pandas as pd from pandas import DataFrame as PandasDataFrame @@ -53,7 +53,7 @@ class ExpectationUniqueRows(DataFrameAggregationExpectation): """ - def __init__(self, column_names: List[str]): + def __init__(self, column_names: List[str], tags: Optional[List[str]] = None): """ Initialize the unique expectation. @@ -72,6 +72,7 @@ def __init__(self, column_names: List[str]): expectation_name="ExpectationUniqueRows", column_names=column_names, description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -192,7 +193,7 @@ class ExpectationDistinctColumnValuesEquals(DataFrameAggregationExpectation): Note: The comparison is exact equality (inclusive). """ - def __init__(self, column_name: str, expected_value: int): + def __init__(self, column_name: str, expected_value: int, tags: Optional[List[str]] = None): """ Initialize the distinct values equals expectation. @@ -211,6 +212,7 @@ def __init__(self, column_name: str, expected_value: int): expectation_name="ExpectationDistinctColumnValuesEquals", column_names=[column_name], description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -285,7 +287,7 @@ class ExpectationDistinctColumnValuesLessThan(DataFrameAggregationExpectation): Note: The threshold is exclusive (actual_count < threshold). """ - def __init__(self, column_name: str, threshold: int): + def __init__(self, column_name: str, threshold: int, tags: Optional[List[str]] = None): """ Initialize the distinct values less than expectation. @@ -304,6 +306,7 @@ def __init__(self, column_name: str, threshold: int): expectation_name="ExpectationDistinctColumnValuesLessThan", column_names=[column_name], description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -378,7 +381,7 @@ class ExpectationDistinctColumnValuesGreaterThan(DataFrameAggregationExpectation Note: The threshold is exclusive (actual_count > threshold). """ - def __init__(self, column_name: str, threshold: int): + def __init__(self, column_name: str, threshold: int, tags: Optional[List[str]] = None): """ Initialize the distinct values greater than expectation. @@ -397,6 +400,7 @@ def __init__(self, column_name: str, threshold: int): expectation_name="ExpectationDistinctColumnValuesGreaterThan", column_names=[column_name], description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -471,7 +475,9 @@ class ExpectationDistinctColumnValuesBetween(DataFrameAggregationExpectation): Note: Both bounds are inclusive (min_value ≤ actual_count ≤ max_value). """ - def __init__(self, column_name: str, min_value: int, max_value: int): + def __init__( + self, column_name: str, min_value: int, max_value: int, tags: Optional[List[str]] = None + ): """ Initialize the distinct values between expectation. 
@@ -498,6 +504,7 @@ def __init__(self, column_name: str, min_value: int, max_value: int): expectation_name="ExpectationDistinctColumnValuesBetween", column_names=[column_name], description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -570,15 +577,17 @@ def aggregate_and_validate_pyspark( @requires_params("column_names", types={"column_names": list}) def create_expectation_unique( column_names: List[str], + tags: Optional[List[str]] = None, ) -> ExpectationUniqueRows: """ Create an ExpectationUniqueRows instance. :param column_names: List of column names to check for uniqueness. If empty, checks all columns. + :param tags: Optional list of tags for filtering expectations. :return: ExpectationUniqueRows instance """ column_names = column_names - return ExpectationUniqueRows(column_names=column_names) + return ExpectationUniqueRows(column_names=column_names, tags=tags) @register_expectation( @@ -599,17 +608,20 @@ def create_expectation_unique( def create_expectation_distinct_column_values_equals( column_name: str, expected_value: int, + tags: Optional[List[str]] = None, ) -> ExpectationDistinctColumnValuesEquals: """ Create an ExpectationDistinctColumnValuesEquals instance. :param column_name: Name of the column to check. :param expected_value: Expected number of distinct values. + :param tags: Optional list of tags for filtering expectations. :return: A configured expectation instance. """ return ExpectationDistinctColumnValuesEquals( column_name=column_name, expected_value=expected_value, + tags=tags, ) @@ -631,17 +643,20 @@ def create_expectation_distinct_column_values_equals( def create_expectation_distinct_column_values_less_than( column_name: str, threshold: int, + tags: Optional[List[str]] = None, ) -> ExpectationDistinctColumnValuesLessThan: """ Create an ExpectationDistinctColumnValuesLessThan instance. :param column_name: Name of the column to check. :param threshold: Threshold for distinct values count (exclusive upper bound). + :param tags: Optional list of tags for filtering expectations. :return: A configured expectation instance. """ return ExpectationDistinctColumnValuesLessThan( column_name=column_name, threshold=threshold, + tags=tags, ) @@ -663,17 +678,20 @@ def create_expectation_distinct_column_values_less_than( def create_expectation_distinct_column_values_greater_than( column_name: str, threshold: int, + tags: Optional[List[str]] = None, ) -> ExpectationDistinctColumnValuesGreaterThan: """ Create an ExpectationDistinctColumnValuesGreaterThan instance. :param column_name: Name of the column to check. :param threshold: Threshold for distinct values count (exclusive lower bound). + :param tags: Optional list of tags for filtering expectations. :return: A configured expectation instance. """ return ExpectationDistinctColumnValuesGreaterThan( column_name=column_name, threshold=threshold, + tags=tags, ) @@ -698,6 +716,7 @@ def create_expectation_distinct_column_values_between( column_name: str, min_value: int, max_value: int, + tags: Optional[List[str]] = None, ) -> ExpectationDistinctColumnValuesBetween: """ Create an ExpectationDistinctColumnValuesBetween instance. @@ -705,10 +724,12 @@ def create_expectation_distinct_column_values_between( :param column_name: Name of the column to check. :param min_value: Minimum number of distinct values (inclusive lower bound). :param max_value: Maximum number of distinct values (inclusive upper bound). + :param tags: Optional key-value tags for this expectation. :return: A configured expectation instance. 
""" return ExpectationDistinctColumnValuesBetween( column_name=column_name, min_value=min_value, max_value=max_value, + tags=tags, ) diff --git a/dataframe_expectations/expectations/column/any_value.py b/dataframe_expectations/expectations/column/any_value.py index b60c877..7500423 100644 --- a/dataframe_expectations/expectations/column/any_value.py +++ b/dataframe_expectations/expectations/column/any_value.py @@ -1,3 +1,5 @@ +from typing import List, Optional + from pyspark.sql import functions as F from dataframe_expectations.core.column_expectation import ( @@ -22,7 +24,9 @@ }, ) @requires_params("column_name", "value", types={"column_name": str, "value": object}) -def create_expectation_value_equals(column_name: str, value: object) -> DataFrameColumnExpectation: +def create_expectation_value_equals( + column_name: str, value: object, tags: Optional[List[str]] = None +) -> DataFrameColumnExpectation: column_name = column_name value = value return DataFrameColumnExpectation( @@ -32,6 +36,7 @@ def create_expectation_value_equals(column_name: str, value: object) -> DataFram fn_violations_pyspark=lambda df: df.filter(F.col(column_name) != value), description=f"'{column_name}' equals {value}", error_message=f"'{column_name}' is not equal to {value}.", + tags=tags, ) @@ -47,7 +52,7 @@ def create_expectation_value_equals(column_name: str, value: object) -> DataFram ) @requires_params("column_name", "value", types={"column_name": str, "value": object}) def create_expectation_value_not_equals( - column_name: str, value: object + column_name: str, value: object, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name value = value @@ -58,6 +63,7 @@ def create_expectation_value_not_equals( fn_violations_pyspark=lambda df: df.filter(F.col(column_name) == value), description=f"'{column_name}' is not equal to {value}", error_message=f"'{column_name}' is equal to {value}.", + tags=tags, ) @@ -71,7 +77,9 @@ def create_expectation_value_not_equals( }, ) @requires_params("column_name", types={"column_name": str}) -def create_expectation_value_null(column_name: str) -> DataFrameColumnExpectation: +def create_expectation_value_null( + column_name: str, tags: Optional[List[str]] = None +) -> DataFrameColumnExpectation: column_name = column_name return DataFrameColumnExpectation( expectation_name="ExpectationValueNull", @@ -80,6 +88,7 @@ def create_expectation_value_null(column_name: str) -> DataFrameColumnExpectatio fn_violations_pyspark=lambda df: df.filter(F.col(column_name).isNotNull()), description=f"'{column_name}' is null", error_message=f"'{column_name}' is not null.", + tags=tags, ) @@ -93,7 +102,9 @@ def create_expectation_value_null(column_name: str) -> DataFrameColumnExpectatio }, ) @requires_params("column_name", types={"column_name": str}) -def create_expectation_value_not_null(column_name: str) -> DataFrameColumnExpectation: +def create_expectation_value_not_null( + column_name: str, tags: Optional[List[str]] = None +) -> DataFrameColumnExpectation: column_name = column_name return DataFrameColumnExpectation( expectation_name="ExpectationValueNotNull", @@ -102,6 +113,7 @@ def create_expectation_value_not_null(column_name: str) -> DataFrameColumnExpect fn_violations_pyspark=lambda df: df.filter(F.col(column_name).isNull()), description=f"'{column_name}' is not null", error_message=f"'{column_name}' is null.", + tags=tags, ) @@ -116,7 +128,9 @@ def create_expectation_value_not_null(column_name: str) -> DataFrameColumnExpect }, ) @requires_params("column_name", 
"values", types={"column_name": str, "values": list}) -def create_expectation_value_in(column_name: str, values: list) -> DataFrameColumnExpectation: +def create_expectation_value_in( + column_name: str, values: list, tags: Optional[List[str]] = None +) -> DataFrameColumnExpectation: column_name = column_name values = values return DataFrameColumnExpectation( @@ -126,6 +140,7 @@ def create_expectation_value_in(column_name: str, values: list) -> DataFrameColu fn_violations_pyspark=lambda df: df.filter(~F.col(column_name).isin(values)), description=f"'{column_name}' is in {values}", error_message=f"'{column_name}' is not in {values}.", + tags=tags, ) @@ -140,7 +155,9 @@ def create_expectation_value_in(column_name: str, values: list) -> DataFrameColu }, ) @requires_params("column_name", "values", types={"column_name": str, "values": list}) -def create_expectation_value_not_in(column_name: str, values: list) -> DataFrameColumnExpectation: +def create_expectation_value_not_in( + column_name: str, values: list, tags: Optional[List[str]] = None +) -> DataFrameColumnExpectation: column_name = column_name values = values return DataFrameColumnExpectation( @@ -150,4 +167,5 @@ def create_expectation_value_not_in(column_name: str, values: list) -> DataFrame fn_violations_pyspark=lambda df: df.filter(F.col(column_name).isin(values)), description=f"'{column_name}' is not in {values}", error_message=f"'{column_name}' is in {values}.", + tags=tags, ) diff --git a/dataframe_expectations/expectations/column/numerical.py b/dataframe_expectations/expectations/column/numerical.py index 4899949..02ebc65 100644 --- a/dataframe_expectations/expectations/column/numerical.py +++ b/dataframe_expectations/expectations/column/numerical.py @@ -1,3 +1,5 @@ +from typing import List, Optional + from pyspark.sql import functions as F from dataframe_expectations.core.column_expectation import ( @@ -23,7 +25,7 @@ ) @requires_params("column_name", "value", types={"column_name": str, "value": (int, float)}) def create_expectation_value_greater_than( - column_name: str, value: float + column_name: str, value: float, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name value = value @@ -34,6 +36,7 @@ def create_expectation_value_greater_than( fn_violations_pyspark=lambda df: df.filter(F.col(column_name) <= value), description=f"'{column_name}' is greater than {value}", error_message=f"'{column_name}' is not greater than {value}.", + tags=tags, ) @@ -49,7 +52,7 @@ def create_expectation_value_greater_than( ) @requires_params("column_name", "value", types={"column_name": str, "value": (int, float)}) def create_expectation_value_less_than( - column_name: str, value: float + column_name: str, value: float, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name value = value @@ -60,6 +63,7 @@ def create_expectation_value_less_than( fn_violations_pyspark=lambda df: df.filter(F.col(column_name) >= value), description=f"'{column_name}' is less than {value}", error_message=f"'{column_name}' is not less than {value}.", + tags=tags, ) @@ -85,7 +89,7 @@ def create_expectation_value_less_than( }, ) def create_expectation_value_between( - column_name: str, min_value: float, max_value: float + column_name: str, min_value: float, max_value: float, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name min_value = min_value @@ -101,4 +105,5 @@ def create_expectation_value_between( ), description=f"'{column_name}' is between 
{min_value} and {max_value}", error_message=f"'{column_name}' is not between {min_value} and {max_value}.", + tags=tags, ) diff --git a/dataframe_expectations/expectations/column/string.py b/dataframe_expectations/expectations/column/string.py index f50a75e..5edb962 100644 --- a/dataframe_expectations/expectations/column/string.py +++ b/dataframe_expectations/expectations/column/string.py @@ -1,3 +1,4 @@ +from typing import List, Optional from pyspark.sql import functions as F from dataframe_expectations.core.column_expectation import ( @@ -23,7 +24,7 @@ ) @requires_params("column_name", "substring", types={"column_name": str, "substring": str}) def create_expectation_string_contains( - column_name: str, substring: str + column_name: str, substring: str, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name substring = substring @@ -34,6 +35,7 @@ def create_expectation_string_contains( fn_violations_pyspark=lambda df: df.filter(~F.col(column_name).contains(substring)), description=f"'{column_name}' contains '{substring}'", error_message=f"'{column_name}' does not contain '{substring}'.", + tags=tags, ) @@ -49,7 +51,7 @@ def create_expectation_string_contains( ) @requires_params("column_name", "substring", types={"column_name": str, "substring": str}) def create_expectation_string_not_contains( - column_name: str, substring: str + column_name: str, substring: str, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name substring = substring @@ -60,6 +62,7 @@ def create_expectation_string_not_contains( fn_violations_pyspark=lambda df: df.filter(F.col(column_name).contains(substring)), description=f"'{column_name}' does not contain '{substring}'", error_message=f"'{column_name}' contains '{substring}'.", + tags=tags, ) @@ -75,7 +78,7 @@ def create_expectation_string_not_contains( ) @requires_params("column_name", "prefix", types={"column_name": str, "prefix": str}) def create_expectation_string_starts_with( - column_name: str, prefix: str + column_name: str, prefix: str, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name prefix = prefix @@ -86,6 +89,7 @@ def create_expectation_string_starts_with( fn_violations_pyspark=lambda df: df.filter(~F.col(column_name).startswith(prefix)), description=f"'{column_name}' starts with '{prefix}'", error_message=f"'{column_name}' does not start with '{prefix}'.", + tags=tags, ) @@ -101,7 +105,7 @@ def create_expectation_string_starts_with( ) @requires_params("column_name", "suffix", types={"column_name": str, "suffix": str}) def create_expectation_string_ends_with( - column_name: str, suffix: str + column_name: str, suffix: str, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name suffix = suffix @@ -112,6 +116,7 @@ def create_expectation_string_ends_with( fn_violations_pyspark=lambda df: df.filter(~F.col(column_name).endswith(suffix)), description=f"'{column_name}' ends with '{suffix}'", error_message=f"'{column_name}' does not end with '{suffix}'.", + tags=tags, ) @@ -127,7 +132,7 @@ def create_expectation_string_ends_with( ) @requires_params("column_name", "length", types={"column_name": str, "length": int}) def create_expectation_string_length_less_than( - column_name: str, length: int + column_name: str, length: int, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name length = length @@ -138,6 +143,7 @@ def create_expectation_string_length_less_than( 
fn_violations_pyspark=lambda df: df.filter(F.length(column_name) >= length), description=f"'{column_name}' length is less than {length}", error_message=f"'{column_name}' length is not less than {length}.", + tags=tags, ) @@ -153,7 +159,7 @@ def create_expectation_string_length_less_than( ) @requires_params("column_name", "length", types={"column_name": str, "length": int}) def create_expectation_string_length_greater_than( - column_name: str, length: int + column_name: str, length: int, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name length = length @@ -164,6 +170,7 @@ def create_expectation_string_length_greater_than( fn_violations_pyspark=lambda df: df.filter(F.length(F.col(column_name)) <= length), description=f"'{column_name}' length is greater than {length}", error_message=f"'{column_name}' length is not greater than {length}.", + tags=tags, ) @@ -185,7 +192,7 @@ def create_expectation_string_length_greater_than( types={"column_name": str, "min_length": int, "max_length": int}, ) def create_expectation_string_length_between( - column_name: str, min_length: int, max_length: int + column_name: str, min_length: int, max_length: int, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name min_length = min_length @@ -202,6 +209,7 @@ def create_expectation_string_length_between( ), description=f"'{column_name}' length is between {min_length} and {max_length}", error_message=f"'{column_name}' length is not between {min_length} and {max_length}.", + tags=tags, ) @@ -217,7 +225,7 @@ def create_expectation_string_length_between( ) @requires_params("column_name", "length", types={"column_name": str, "length": int}) def create_expectation_string_length_equals( - column_name: str, length: int + column_name: str, length: int, tags: Optional[List[str]] = None ) -> DataFrameColumnExpectation: column_name = column_name length = length @@ -228,4 +236,5 @@ def create_expectation_string_length_equals( fn_violations_pyspark=lambda df: df.filter(F.length(F.col(column_name)) != length), description=f"'{column_name}' length equals {length}", error_message=f"'{column_name}' length is not equal to {length}.", + tags=tags, ) diff --git a/dataframe_expectations/logging_utils.py b/dataframe_expectations/logging_utils.py deleted file mode 100644 index f2f74e6..0000000 --- a/dataframe_expectations/logging_utils.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging - - -def setup_logger(name=None): - """Sets up the logger for the entire run.""" - # Suppress verbose logs from py4j - logging.getLogger("py4j").setLevel(logging.ERROR) - logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR) - - # Create or get a logger - logger = logging.getLogger(name) - logger.setLevel(logging.INFO) # Set the default log level - logger.propagate = False # Disable logger propagation to prevent duplicate logs - DATE_FORMAT = "%Y-%m-%d %H:%M:%S" - MSG_FORMAT = "%(asctime)s %(levelname)-8s [%(filename)s:%(funcName)s():%(lineno)d] %(message)s" - - # Check if the logger already has handlers to avoid duplicate logs - if not logger.hasHandlers(): - # Create a console handler - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - - # Create a formatter and set it for the handler - formatter = logging.Formatter(MSG_FORMAT, DATE_FORMAT) - console_handler.setFormatter(formatter) - - # Add the handler to the logger - logger.addHandler(console_handler) - - return logger diff --git a/dataframe_expectations/registry.py 
b/dataframe_expectations/registry.py index 4581589..18277ac 100644 --- a/dataframe_expectations/registry.py +++ b/dataframe_expectations/registry.py @@ -7,9 +7,9 @@ ExpectationMetadata, ExpectationSubcategory, ) -from dataframe_expectations.logging_utils import setup_logger +import logging -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # Type alias for registry entry (factory function + metadata) FactoryFunction = Callable[..., DataFrameExpectation] diff --git a/dataframe_expectations/suite.py b/dataframe_expectations/suite.py index e80481f..d844b1d 100644 --- a/dataframe_expectations/suite.py +++ b/dataframe_expectations/suite.py @@ -1,17 +1,21 @@ from functools import wraps -from typing import Callable, List, Optional, cast +from typing import Any, Callable, Dict, List, Optional, cast -from dataframe_expectations.core.types import DataFrameLike +from dataframe_expectations.core.types import DataFrameLike, TagMatchMode +from dataframe_expectations.core.tagging import TagSet from dataframe_expectations.registry import ( DataFrameExpectationRegistry, ) -from dataframe_expectations.logging_utils import setup_logger +from dataframe_expectations.core.expectation import DataFrameExpectation +import logging + from dataframe_expectations.result_message import ( DataFrameExpectationFailureMessage, DataFrameExpectationSuccessMessage, ) +from dataframe_expectations.core.suite_result import SuiteExecutionResult -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) class DataFrameExpectationsSuiteFailure(Exception): @@ -21,10 +25,12 @@ def __init__( self, total_expectations: int, failures: List[DataFrameExpectationFailureMessage], + result: Optional[SuiteExecutionResult] = None, *args, ): self.failures = failures self.total_expectations = total_expectations + self.result = result super().__init__(*args) def __str__(self): @@ -48,29 +54,147 @@ def __str__(self): class DataFrameExpectationsSuiteRunner: """ Immutable runner for executing a fixed set of expectations. - - This class is created by DataFrameExpectationsSuite.build() and contains - a snapshot of expectations that won't change during execution. + This class is created by DataFrameExpectationsSuite.build() and + runs the expectations on provided DataFrames. """ - def __init__(self, expectations: List): + @staticmethod + def _matches_tag_filter( + expectation: Any, + filter_tag_set: TagSet, + tag_match_mode: TagMatchMode, + ) -> bool: """ - Initialize the runner with a list of expectations. + Check if an expectation matches the tag filter criteria. + + :param expectation: Expectation instance to check. + :param filter_tag_set: Tag filter to match against. + :param tag_match_mode: Match mode - TagMatchMode.ANY (OR) or TagMatchMode.ALL (AND). + :return: True if expectation matches filter, False otherwise. + """ + exp_tag_set = expectation.get_tags() + + # Check if expectation matches filter + match tag_match_mode: + case TagMatchMode.ANY: + return exp_tag_set.has_any_tag_from(filter_tag_set) + case TagMatchMode.ALL: + return exp_tag_set.has_all_tags_from(filter_tag_set) - :param expectations: List of expectation instances to run. + def __init__( + self, + expectations: List[Any], + suite_name: Optional[str] = None, + violation_sample_limit: int = 5, + tags: Optional[List[str]] = None, + tag_match_mode: Optional[TagMatchMode] = None, + ): + """ + Initialize the runner with a list of expectations and metadata. + + :param expectations: List of expectation instances. 
+ :param suite_name: Optional name for the suite. + :param violation_sample_limit: Max number of violation rows to include in results. + :param tags: Optional tag filters as list of strings in "key:value" format. + Example: ["priority:high", "priority:medium"] + If None or empty, all expectations will run. + :param tag_match_mode: How to match tags - TagMatchMode.ANY (OR logic) or TagMatchMode.ALL (AND logic). + Required if tags are provided, must be None if tags are not provided. + - TagMatchMode.ANY: Expectation matches if it has ANY of the filter tags + - TagMatchMode.ALL: Expectation matches if it has ALL of the filter tags + :raises ValueError: If tag_match_mode is provided without tags, or if tags are provided without tag_match_mode, + or if tag filters result in zero expectations to run. """ - self.__expectations = tuple(expectations) # Immutable tuple + self.__all_expectations = tuple(expectations) # Store all expectations + + # Create filter TagSet from tags list + self.__filter_tag_set = TagSet(tags) + + # Validate tags and tag_match_mode relationship + if self.__filter_tag_set.is_empty() and tag_match_mode is not None: + raise ValueError( + "tag_match_mode cannot be provided when no tags are specified. " + "Either provide tags or set tag_match_mode to None." + ) + + if not self.__filter_tag_set.is_empty() and tag_match_mode is None: + raise ValueError( + "tag_match_mode must be specified (TagMatchMode.ANY or TagMatchMode.ALL) when tags are provided." + ) + + self.__tag_match_mode = tag_match_mode + + # Filter expectations based on tags and track skipped ones + if not self.__filter_tag_set.is_empty(): + # At this point, validation ensures tag_match_mode is not None + # This check is for type narrowing (mypy/pyright) + if tag_match_mode is None: + # This should never happen due to validation above, but satisfies type checker + raise ValueError( + "tag_match_mode must be specified (TagMatchMode.ANY or TagMatchMode.ALL) when tags are provided." + ) + + filtered = [] + skipped = [] + for exp in self.__all_expectations: + if self._matches_tag_filter(exp, self.__filter_tag_set, tag_match_mode): + filtered.append(exp) + else: + skipped.append(exp) + + self.__expectations = tuple(filtered) + self.__skipped_expectations = tuple(skipped) + + # Raise error if all expectations were filtered out + if len(self.__expectations) == 0: + error_message = ( + f"Tag filter {self.__filter_tag_set} with mode '{tag_match_mode}' resulted in zero expectations to run. " + f"All {len(self.__all_expectations)} expectations were skipped. " + f"Please adjust your filter criteria." + ) + logger.error(error_message) + raise ValueError(error_message) + + logger.debug( + f"Filtered {len(self.__all_expectations)} expectations to {len(self.__expectations)} " + f"matching tags: {self.__filter_tag_set} (mode: {tag_match_mode}). 
Skipped: {len(self.__skipped_expectations)}" + ) + else: + self.__expectations = self.__all_expectations + self.__skipped_expectations = tuple() # No expectations skipped + + self.__suite_name = suite_name + self.__violation_sample_limit = violation_sample_limit @property - def expectation_count(self) -> int: - """Return the number of expectations in this runner.""" + def selected_expectations_count(self) -> int: + """Return the number of expectations that will run (after filtering).""" return len(self.__expectations) - def list_expectations(self) -> List[str]: + @property + def total_expectations(self) -> int: + """Return the total number of expectations before filtering.""" + return len(self.__all_expectations) + + @property + def get_applied_tags(self) -> TagSet: + """Return the applied tag filters for this runner.""" + return self.__filter_tag_set + + def list_all_expectations(self) -> List[str]: """ - Return a list of expectation descriptions in this runner. + Return a list of all expectation descriptions before filtering. - :return: List of expectation descriptions as strings in the format: + :return: List of all expectation descriptions as strings in the format: + "ExpectationName (description)" + """ + return [f"{exp}" for exp in self.__all_expectations] + + def list_selected_expectations(self) -> List[str]: + """ + Return a list of selected expectation descriptions (after filtering). + + :return: List of selected expectation descriptions as strings in the format: "ExpectationName (description)" """ return [f"{exp}" for exp in self.__expectations] @@ -78,17 +202,33 @@ def list_expectations(self) -> List[str]: def run( self, data_frame: DataFrameLike, - ) -> None: + raise_on_failure: bool = True, + context: Optional[Dict[str, Any]] = None, + ) -> SuiteExecutionResult: """ Run all expectations on the provided DataFrame with PySpark caching optimization. :param data_frame: The DataFrame to validate. + :param raise_on_failure: If True (default), raises DataFrameExpectationsSuiteFailure on any failures. + If False, returns SuiteExecutionResult instead. + :param context: Optional runtime context metadata (e.g., {"job_id": "123", "env": "prod"}). + :return: None if raise_on_failure=True and all pass, SuiteExecutionResult if raise_on_failure=False. """ + from datetime import datetime from dataframe_expectations.core.types import DataFrameType - from dataframe_expectations.core.expectation import DataFrameExpectation + from dataframe_expectations.core.suite_result import ( + ExpectationResult, + SuiteExecutionResult, + serialize_violations, + ExpectationStatus, + ) + + # Track execution timing + start_time = datetime.now() successes = [] failures = [] + expectation_results = [] margin_len = 80 header_message = "Running expectations suite" @@ -101,6 +241,7 @@ def run( # PySpark caching optimization data_frame_type = DataFrameExpectation.infer_data_frame_type(data_frame) was_already_cached = False + dataframe_row_count = DataFrameExpectation.num_data_frame_rows(data_frame) if data_frame_type == DataFrameType.PYSPARK: from pyspark.sql import DataFrame as PySparkDataFrame @@ -118,20 +259,52 @@ def run( # Run all expectations for expectation in self.__expectations: result = expectation.validate(data_frame=data_frame) - if isinstance(result, DataFrameExpectationSuccessMessage): - logger.info( - f"{expectation.get_expectation_name()} ({expectation.get_description()}) ... 
OK" - ) - successes.append(result) - elif isinstance(result, DataFrameExpectationFailureMessage): - logger.info( - f"{expectation.get_expectation_name()} ({expectation.get_description()}) ... FAIL" - ) - failures.append(result) - else: - raise ValueError( - f"Unexpected result type: {type(result)} for expectation: {expectation.get_expectation_name()}" - ) + # Get expectation's tags as TagSet + exp_tag_set = expectation.get_tags() + + # Build ExpectationResult object using pattern matching + match result: + case DataFrameExpectationSuccessMessage(): + logger.debug( + f"{expectation.get_expectation_name()} ({expectation.get_description()}) ... OK" + ) + successes.append(result) + expectation_results.append( + ExpectationResult( + expectation_name=expectation.get_expectation_name(), + description=expectation.get_description(), + status=ExpectationStatus.PASSED, + tags=exp_tag_set, + error_message=None, + violation_count=None, + violation_sample=None, + ) + ) + case DataFrameExpectationFailureMessage(): + logger.warning( + f"{expectation.get_expectation_name()} ({expectation.get_description()}) ... FAIL" + ) + failures.append(result) + # Serialize violations without storing raw dataframes + violations_df = result.get_violations_data_frame() + violation_count, violation_sample = serialize_violations( + violations_df, data_frame_type, self.__violation_sample_limit + ) + expectation_results.append( + ExpectationResult( + expectation_name=expectation.get_expectation_name(), + description=expectation.get_description(), + status=ExpectationStatus.FAILED, + tags=exp_tag_set, + error_message=str(result), + violation_count=violation_count, + violation_sample=violation_sample, + ) + ) + case _: + raise ValueError( + f"Unexpected result type: {type(result)} for expectation: {expectation.get_expectation_name()}" + ) finally: # Uncache the DataFrame if we cached it (and it wasn't already cached) if data_frame_type == DataFrameType.PYSPARK and not was_already_cached: @@ -140,6 +313,9 @@ def run( logger.debug("Uncaching PySpark DataFrame after expectations suite execution") cast(PySparkDataFrame, data_frame).unpersist() + # Track end time + end_time = datetime.now() + footer_message = f"{len(successes)} success, {len(failures)} failures" footer_prefix = "=" * ((margin_len - len(footer_message) - 2) // 2) footer_suffix = "=" * ( @@ -147,10 +323,48 @@ def run( ) logger.info(f"{footer_prefix} {footer_message} {footer_suffix}") - if len(failures) > 0: + # Build skipped expectations list + # Build skipped expectations as ExpectationResult with status="skipped" + skipped_list = [] + for exp in self.__skipped_expectations: + # Get expectation's tags as TagSet + exp_tag_set = exp.get_tags() + skipped_list.append( + ExpectationResult( + expectation_name=exp.get_expectation_name(), + description=exp.get_description(), + status=ExpectationStatus.SKIPPED, + tags=exp_tag_set, + error_message=None, + violation_count=None, + violation_sample=None, + ) + ) + + # Build result object + # Combine executed and skipped expectations + all_results = expectation_results + skipped_list + suite_result = SuiteExecutionResult( + suite_name=self.__suite_name, + context=context or {}, + applied_filters=self.__filter_tag_set, + tag_match_mode=self.__tag_match_mode if not self.__filter_tag_set.is_empty() else None, + results=all_results, + start_time=start_time, + end_time=end_time, + dataframe_type=data_frame_type, + dataframe_row_count=dataframe_row_count, + dataframe_was_cached=was_already_cached, + ) + + # Dual-mode execution: raise 
exception or return result + if len(failures) > 0 and raise_on_failure: raise DataFrameExpectationsSuiteFailure( - total_expectations=len(self.__expectations), failures=failures + total_expectations=len(self.__expectations), + failures=failures, + result=suite_result, ) + return suite_result def validate(self, func: Optional[Callable] = None, *, allow_none: bool = False) -> Callable: """ @@ -191,7 +405,7 @@ def wrapper(*args, **kwargs): # Handle None case if result is None: if allow_none: - logger.info( + logger.debug( f"Function '{f.__name__}' returned None, skipping validation (allow_none=True)" ) return None @@ -202,7 +416,7 @@ def wrapper(*args, **kwargs): ) # Validate the returned DataFrame - logger.info(f"Validating DataFrame returned from '{f.__name__}'") + logger.debug(f"Validating DataFrame returned from '{f.__name__}'") self.run(data_frame=result) return result @@ -226,20 +440,49 @@ class DataFrameExpectationsSuite: immutable runner that can execute the expectations on DataFrames. Example: - suite = DataFrameExpectationsSuite() - suite.expect_value_greater_than(column_name="age", value=18) - suite.expect_value_less_than(column_name="salary", value=100000) + suite = DataFrameExpectationsSuite(suite_name="user_validation") + suite.expect_value_greater_than( + column_name="age", + value=18, + tags=["priority:high", "category:compliance"] + ) + suite.expect_value_less_than( + column_name="salary", + value=100000, + tags=["priority:medium", "category:budget"] + ) + suite.expect_min_rows( + min_rows=10, + tags=["priority:low", "category:data_quality"] + ) + + # Build runner for all expectations (no filtering) + runner_all = suite.build() + runner_all.run(df) # Runs all 3 expectations - runner = suite.build() - runner.run(df1) - runner.run(df2) # Same expectations, different DataFrame + # Build runner for high OR medium priority expectations (OR logic) + runner_any = suite.build(tags=["priority:high", "priority:medium"], tag_match_mode=TagMatchMode.ANY) + runner_any.run(df) # Runs 2 expectations (age and salary checks) + + # Build runner for expectations with both high priority AND compliance category (AND logic) + runner_and = suite.build(tags=["priority:high", "category:compliance"], tag_match_mode=TagMatchMode.ALL) + runner_and.run(df) # Runs 1 expectation (age check - has both tags) """ - def __init__(self): + def __init__( + self, + suite_name: Optional[str] = None, + violation_sample_limit: int = 5, + ): """ Initialize the expectation suite builder. + + :param suite_name: Optional name for the suite (useful for logging/reporting). + :param violation_sample_limit: Max number of violation rows to include in results (default 5). """ - self.__expectations = [] + self.__expectations: list[Any] = [] # List of expectation instances + self.__suite_name = suite_name + self.__violation_sample_limit = violation_sample_limit def __getattr__(self, name: str): """ @@ -262,17 +505,23 @@ def _create_expectation_method(self, suite_method_name: str): Returns a closure that captures the suite_method_name and self. """ - def dynamic_method(**kwargs): - """Dynamically generated expectation method.""" + def dynamic_method(tags: Optional[List[str]] = None, **kwargs): + """Dynamically generated expectation method. + + :param tags: Optional tags as list of strings in "key:value" format. + Example: ["priority:high", "env:test"] + :param **kwargs: Parameters for the expectation. 
+ """ try: expectation = DataFrameExpectationRegistry.get_expectation_by_suite_method( - suite_method_name=suite_method_name, **kwargs + suite_method_name=suite_method_name, tags=tags, **kwargs ) except ValueError as e: raise AttributeError(str(e)) from e - logger.info(f"Adding expectation: {expectation}") + logger.debug(f"Adding expectation: {expectation}") + # Store expectation instance self.__expectations.append(expectation) return self @@ -281,16 +530,28 @@ def dynamic_method(**kwargs): return dynamic_method - def build(self) -> DataFrameExpectationsSuiteRunner: + def build( + self, + tags: Optional[List[str]] = None, + tag_match_mode: Optional[TagMatchMode] = None, + ) -> DataFrameExpectationsSuiteRunner: """ Build an immutable runner from the current expectations. - The runner contains a snapshot of expectations at the time of building. + This creates a snapshot of the current expectations in the suite. You can continue to add more expectations to this suite and build new runners without affecting previously built runners. + :param tags: Optional tag filters as list of strings in "key:value" format. + Example: ["priority:high", "priority:medium"] + If None or empty, all expectations will be included. + :param tag_match_mode: How to match tags - TagMatchMode.ANY (OR logic) or TagMatchMode.ALL (AND logic). + Required if tags are provided, must be None if tags are not provided. + - TagMatchMode.ANY: Include expectations with ANY of the filter tags + - TagMatchMode.ALL: Include expectations with ALL of the filter tags :return: An immutable DataFrameExpectationsSuiteRunner instance. - :raises ValueError: If no expectations have been added. + :raises ValueError: If no expectations have been added, if tag_match_mode validation fails, + or if no expectations match the tag filters. 
""" if not self.__expectations: raise ValueError( @@ -299,53 +560,10 @@ def build(self) -> DataFrameExpectationsSuiteRunner: ) # Create a copy of expectations for the runner - return DataFrameExpectationsSuiteRunner(list(self.__expectations)) - - -if __name__ == "__main__": - import pandas as pd - - # Example 1: Direct usage - print("=== Example 1: Direct Usage ===") - suite = DataFrameExpectationsSuite() - suite.expect_value_greater_than(column_name="age", value=18) - suite.expect_value_less_than(column_name="salary", value=1000) - suite.expect_unique_rows(column_names=["id"]) - suite.expect_column_mean_between(column_name="age", min_value=20, max_value=40) - suite.expect_column_max_between(column_name="salary", min_value=80000, max_value=85000) - - # Create a sample DataFrame - df = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "age": [20, 25, 30, 35], - "salary": [50000, 90000, 80000, 85000], - } - ) - - # Build the runner and execute - runner = suite.build() - runner.run(data_frame=df) - - # Example 2: Decorator usage - print("\n=== Example 2: Decorator Usage ===") - suite = DataFrameExpectationsSuite() - suite.expect_value_greater_than(column_name="age", value=20) - suite.expect_unique_rows(column_names=["id"]) - - runner = suite.build() - - @runner.validate - def load_employee_data(): - """Load employee data with automatic validation.""" - return pd.DataFrame( - { - "id": [1, 2, 3], - "age": [18, 30, 35], - "name": ["Alice", "Bob", "Charlie"], - } + return DataFrameExpectationsSuiteRunner( + expectations=list(self.__expectations), + suite_name=self.__suite_name, + violation_sample_limit=self.__violation_sample_limit, + tags=tags, + tag_match_mode=tag_match_mode, ) - - # Function is automatically validated when called - validated_df = load_employee_data() - print(f"Successfully loaded and validated DataFrame with {len(validated_df)} rows") diff --git a/dataframe_expectations/suite.pyi b/dataframe_expectations/suite.pyi index de22e1c..e38c366 100644 --- a/dataframe_expectations/suite.pyi +++ b/dataframe_expectations/suite.pyi @@ -3,14 +3,18 @@ # DO NOT EDIT - Regenerate with: python scripts/generate_suite_stubs.py from functools import wraps -from typing import Union, Callable, List, Optional, cast -from dataframe_expectations.core.types import DataFrameLike +from typing import Union, Any, Callable, Dict, List, Optional, cast +from dataframe_expectations.core.types import DataFrameLike, TagMatchMode +from dataframe_expectations.core.tagging import TagSet from dataframe_expectations.registry import DataFrameExpectationRegistry +from dataframe_expectations.core.expectation import DataFrameExpectation +import logging from dataframe_expectations.result_message import DataFrameExpectationFailureMessage, DataFrameExpectationSuccessMessage +from dataframe_expectations.core.suite_result import SuiteExecutionResult class DataFrameExpectationsSuiteFailure(Exception): """Raised when one or more expectations in the suite fail.""" - def __init__(self, total_expectations: int, failures: List[DataFrameExpectationFailureMessage], *args): + def __init__(self, total_expectations: int, failures: List[DataFrameExpectationFailureMessage], result: Optional[SuiteExecutionResult]=None, *args): ... def __str__(self): ... @@ -18,41 +22,90 @@ class DataFrameExpectationsSuiteFailure(Exception): class DataFrameExpectationsSuiteRunner: """ Immutable runner for executing a fixed set of expectations. 
- - This class is created by DataFrameExpectationsSuite.build() and contains - a snapshot of expectations that won't change during execution. + This class is created by DataFrameExpectationsSuite.build() and + runs the expectations on provided DataFrames. """ - def __init__(self, expectations: List): + @staticmethod + def _matches_tag_filter(expectation: Any, filter_tag_set: TagSet, tag_match_mode: TagMatchMode) -> bool: + """ + + Check if an expectation matches the tag filter criteria. + + :param expectation: Expectation instance to check. + :param filter_tag_set: Tag filter to match against. + :param tag_match_mode: Match mode - TagMatchMode.ANY (OR) or TagMatchMode.ALL (AND). + :return: True if expectation matches filter, False otherwise. + + """ + ... + def __init__(self, expectations: List[Any], suite_name: Optional[str]=None, violation_sample_limit: int=5, tags: Optional[List[str]]=None, tag_match_mode: Optional[TagMatchMode]=None): """ - Initialize the runner with a list of expectations. + Initialize the runner with a list of expectations and metadata. - :param expectations: List of expectation instances to run. + :param expectations: List of expectation instances. + :param suite_name: Optional name for the suite. + :param violation_sample_limit: Max number of violation rows to include in results. + :param tags: Optional tag filters as list of strings in "key:value" format. + Example: ["priority:high", "priority:medium"] + If None or empty, all expectations will run. + :param tag_match_mode: How to match tags - TagMatchMode.ANY (OR logic) or TagMatchMode.ALL (AND logic). + Required if tags are provided, must be None if tags are not provided. + - TagMatchMode.ANY: Expectation matches if it has ANY of the filter tags + - TagMatchMode.ALL: Expectation matches if it has ALL of the filter tags + :raises ValueError: If tag_match_mode is provided without tags, or if tags are provided without tag_match_mode, + or if tag filters result in zero expectations to run. """ ... @property - def expectation_count(self) -> int: + def selected_expectations_count(self) -> int: + """ + Return the number of expectations that will run (after filtering). + """ + ... + @property + def total_expectations(self) -> int: + """ + Return the total number of expectations before filtering. + """ + ... + @property + def get_applied_tags(self) -> TagSet: + """ + Return the applied tag filters for this runner. + """ + ... + def list_all_expectations(self) -> List[str]: """ - Return the number of expectations in this runner. + + Return a list of all expectation descriptions before filtering. + + :return: List of all expectation descriptions as strings in the format: + "ExpectationName (description)" + """ ... - def list_expectations(self) -> List[str]: + def list_selected_expectations(self) -> List[str]: """ - Return a list of expectation descriptions in this runner. + Return a list of selected expectation descriptions (after filtering). - :return: List of expectation descriptions as strings in the format: + :return: List of selected expectation descriptions as strings in the format: "ExpectationName (description)" """ ... - def run(self, data_frame: DataFrameLike) -> None: + def run(self, data_frame: DataFrameLike, raise_on_failure: bool=True, context: Optional[Dict[str, Any]]=None) -> SuiteExecutionResult: """ Run all expectations on the provided DataFrame with PySpark caching optimization. :param data_frame: The DataFrame to validate. 
+ :param raise_on_failure: If True (default), raises DataFrameExpectationsSuiteFailure on any failures. + If False, returns SuiteExecutionResult instead. + :param context: Optional runtime context metadata (e.g., {"job_id": "123", "env": "prod"}). + :return: None if raise_on_failure=True and all pass, SuiteExecutionResult if raise_on_failure=False. """ ... @@ -97,19 +150,42 @@ class DataFrameExpectationsSuite: immutable runner that can execute the expectations on DataFrames. Example: - suite = DataFrameExpectationsSuite() - suite.expect_value_greater_than(column_name="age", value=18) - suite.expect_value_less_than(column_name="salary", value=100000) - - runner = suite.build() - runner.run(df1) - runner.run(df2) # Same expectations, different DataFrame + suite = DataFrameExpectationsSuite(suite_name="user_validation") + suite.expect_value_greater_than( + column_name="age", + value=18, + tags=["priority:high", "category:compliance"] + ) + suite.expect_value_less_than( + column_name="salary", + value=100000, + tags=["priority:medium", "category:budget"] + ) + suite.expect_min_rows( + min_rows=10, + tags=["priority:low", "category:data_quality"] + ) + + # Build runner for all expectations (no filtering) + runner_all = suite.build() + runner_all.run(df) # Runs all 3 expectations + + # Build runner for high OR medium priority expectations (OR logic) + runner_any = suite.build(tags=["priority:high", "priority:medium"], tag_match_mode=TagMatchMode.ANY) + runner_any.run(df) # Runs 2 expectations (age and salary checks) + + # Build runner for expectations with both high priority AND compliance category (AND logic) + runner_and = suite.build(tags=["priority:high", "category:compliance"], tag_match_mode=TagMatchMode.ALL) + runner_and.run(df) # Runs 1 expectation (age check - has both tags) """ - def __init__(self): + def __init__(self, suite_name: Optional[str]=None, violation_sample_limit: int=5): """ Initialize the expectation suite builder. + :param suite_name: Optional name for the suite (useful for logging/reporting). + :param violation_sample_limit: Max number of violation rows to include in results (default 5). + """ ... @@ -118,6 +194,7 @@ class DataFrameExpectationsSuite: column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the maximum value of a numeric column falls within a specified range @@ -130,6 +207,8 @@ class DataFrameExpectationsSuite: :param min_value: The minimum allowed maximum value :param max_value: The maximum allowed maximum value + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -139,6 +218,7 @@ class DataFrameExpectationsSuite: column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the mean (average) of a numeric column falls within a specified range @@ -151,6 +231,8 @@ class DataFrameExpectationsSuite: :param min_value: The minimum allowed mean value :param max_value: The maximum allowed mean value + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... 
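A minimal sketch of the dual-mode `run()` described above, assuming a pandas DataFrame with made-up column names (`order_id`, `amount`) and made-up context values; it shows that the exception path carries the same structured result object as the non-raising path:

```python
import pandas as pd

from dataframe_expectations import (
    DataFrameExpectationsSuite,
    DataFrameExpectationsSuiteFailure,
)

suite = (
    DataFrameExpectationsSuite(suite_name="orders_checks", violation_sample_limit=3)
    .expect_value_not_null(column_name="order_id")
    .expect_value_greater_than(column_name="amount", value=0)
)
runner = suite.build()

df = pd.DataFrame({"order_id": [1, 2, None], "amount": [10.0, -5.0, 3.0]})

try:
    # Default mode: raises DataFrameExpectationsSuiteFailure on any failed expectation.
    runner.run(df, context={"job_id": "nightly-2024-06-01", "env": "prod"})
except DataFrameExpectationsSuiteFailure as exc:
    # The exception carries the same SuiteExecutionResult as raise_on_failure=False.
    if exc.result is not None:
        for exp_result in exc.result.results:
            print(exp_result.expectation_name, exp_result.status, exp_result.violation_count)
```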
@@ -160,6 +242,7 @@ class DataFrameExpectationsSuite: column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the median of a numeric column falls within a specified range @@ -172,6 +255,8 @@ class DataFrameExpectationsSuite: :param min_value: The minimum allowed median value :param max_value: The maximum allowed median value + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -181,6 +266,7 @@ class DataFrameExpectationsSuite: column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the minimum value of a numeric column falls within a specified range @@ -193,6 +279,8 @@ class DataFrameExpectationsSuite: :param min_value: The minimum allowed minimum value :param max_value: The maximum allowed minimum value + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -203,6 +291,7 @@ class DataFrameExpectationsSuite: quantile: Union[int, float], min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if a specific quantile of a numeric column falls within a specified range @@ -216,6 +305,8 @@ class DataFrameExpectationsSuite: :param min_value: The minimum allowed value for the quantile :param max_value: The maximum allowed value for the quantile + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -225,6 +316,7 @@ class DataFrameExpectationsSuite: column_name: str, min_value: int, max_value: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if a column has a number of distinct values within a specified range @@ -237,6 +329,8 @@ class DataFrameExpectationsSuite: :param min_value: The minimum number of distinct values (inclusive) :param max_value: The maximum number of distinct values (inclusive) + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -245,6 +339,7 @@ class DataFrameExpectationsSuite: self, column_name: str, expected_value: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if a column has exactly a specified number of distinct values @@ -256,6 +351,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check for distinct values :param expected_value: The expected number of distinct values + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... 
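The aggregation checks above follow the same builder pattern. A sketch of building a tag-filtered runner and inspecting it before execution; the method names `expect_column_median_between` and `expect_column_quantile_between` are assumed here by analogy with the documented `expect_column_mean_between`, and `latency_ms` plus the tag values are illustrative only:

```python
from dataframe_expectations import DataFrameExpectationsSuite, TagMatchMode

suite = (
    DataFrameExpectationsSuite(suite_name="latency_profile")
    # Method names below are assumed by analogy with expect_column_mean_between.
    .expect_column_median_between(
        column_name="latency_ms", min_value=5, max_value=50, tags=["priority:high"]
    )
    .expect_column_quantile_between(
        column_name="latency_ms", quantile=0.99, min_value=5, max_value=250, tags=["priority:low"]
    )
)

runner = suite.build(tags=["priority:high"], tag_match_mode=TagMatchMode.ANY)

# The runner exposes what was defined versus what survived the tag filter.
print(runner.total_expectations)            # 2 expectations defined
print(runner.selected_expectations_count)   # 1 expectation survives the filter
print(runner.list_selected_expectations())  # descriptions of what will actually run
print(runner.list_all_expectations())       # everything the suite defined, filtered or not
```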
@@ -264,6 +361,7 @@ class DataFrameExpectationsSuite: self, column_name: str, threshold: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if a column has at least a specified number of distinct values @@ -275,6 +373,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check for distinct values :param threshold: The minimum number of distinct values (exclusive) + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -283,6 +383,7 @@ class DataFrameExpectationsSuite: self, column_name: str, threshold: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if a column has at most a specified number of distinct values @@ -294,6 +395,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check for distinct values :param threshold: The maximum number of distinct values (exclusive) + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -302,6 +405,7 @@ class DataFrameExpectationsSuite: self, column_name: str, max_count: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the count of null/NaN values in a specific column is below a threshold @@ -313,6 +417,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check for null count :param max_count: The maximum allowed count of null/NaN values + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -321,6 +427,7 @@ class DataFrameExpectationsSuite: self, column_name: str, max_percentage: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the percentage of null/NaN values in a specific column is below a threshold @@ -332,6 +439,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check for null percentage :param max_percentage: The maximum allowed percentage of null/NaN values (0.0 to 100.0) + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -339,6 +448,7 @@ class DataFrameExpectationsSuite: def expect_max_rows( self, max_rows: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the DataFrame has at most a maximum number of rows @@ -349,6 +459,8 @@ class DataFrameExpectationsSuite: :param max_rows: The maximum number of rows expected + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -356,6 +468,7 @@ class DataFrameExpectationsSuite: def expect_min_rows( self, min_rows: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the DataFrame has at least a minimum number of rows @@ -366,6 +479,8 @@ class DataFrameExpectationsSuite: :param min_rows: The minimum number of rows expected + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... 
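A short sketch of the row-count guardrails (`expect_min_rows` / `expect_max_rows`) together with the eager validation that `build()` performs on tag filters; the tag values are illustrative only:

```python
from dataframe_expectations import DataFrameExpectationsSuite, TagMatchMode

suite = (
    DataFrameExpectationsSuite()
    .expect_min_rows(min_rows=100, tags=["category:volume"])
    .expect_max_rows(max_rows=1_000_000, tags=["category:volume"])
)

# Filter arguments are validated when the runner is built:
# tags without tag_match_mode (or the reverse) raises ValueError, and so does
# a filter that leaves zero expectations to run.
try:
    suite.build(tags=["category:nonexistent"], tag_match_mode=TagMatchMode.ALL)
except ValueError as exc:
    print(f"Rejected filter: {exc}")

# A filter that matches at least one expectation builds normally.
runner = suite.build(tags=["category:volume"], tag_match_mode=TagMatchMode.ANY)
```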
@@ -374,6 +489,7 @@ class DataFrameExpectationsSuite: self, column_name: str, substring: str, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a string column contain a specified substring @@ -385,6 +501,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param substring: The substring to search for + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -393,6 +511,7 @@ class DataFrameExpectationsSuite: self, column_name: str, suffix: str, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a string column end with a specified suffix @@ -404,6 +523,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param suffix: The suffix to search for + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -413,6 +534,7 @@ class DataFrameExpectationsSuite: column_name: str, min_length: int, max_length: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the length of the values in a string column is between two specified lengths @@ -425,6 +547,8 @@ class DataFrameExpectationsSuite: :param min_length: The minimum length that the values should be :param max_length: The maximum length that the values should be + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -433,6 +557,7 @@ class DataFrameExpectationsSuite: self, column_name: str, length: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the length of the values in a string column equals a specified length @@ -444,6 +569,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param length: The length that the values should equal + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -452,6 +579,7 @@ class DataFrameExpectationsSuite: self, column_name: str, length: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the length of the values in a string column is greater than a specified length @@ -463,6 +591,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param length: The length that the values should be greater than + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -471,6 +601,7 @@ class DataFrameExpectationsSuite: self, column_name: str, length: int, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the length of the values in a string column is less than a specified length @@ -482,6 +613,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param length: The length that the values should be less than + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... 
@@ -490,6 +623,7 @@ class DataFrameExpectationsSuite: self, column_name: str, substring: str, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a string column do not contain a specified substring @@ -501,6 +635,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param substring: The substring to search for + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -509,6 +645,7 @@ class DataFrameExpectationsSuite: self, column_name: str, prefix: str, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a string column start with a specified prefix @@ -520,6 +657,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param prefix: The prefix to search for + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -527,6 +666,7 @@ class DataFrameExpectationsSuite: def expect_unique_rows( self, column_names: list, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if all rows in the DataFrame are unique based on specified columns @@ -537,6 +677,8 @@ class DataFrameExpectationsSuite: :param column_names: List of column names to check for uniqueness. Empty list checks all columns + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -546,6 +688,7 @@ class DataFrameExpectationsSuite: column_name: str, min_value: Union[int, float], max_value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column are between two specified values @@ -558,6 +701,8 @@ class DataFrameExpectationsSuite: :param min_value: The minimum value for the range :param max_value: The maximum value for the range + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -566,6 +711,7 @@ class DataFrameExpectationsSuite: self, column_name: str, value: object, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column equal a specified value @@ -577,6 +723,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param value: The value to compare against + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -585,6 +733,7 @@ class DataFrameExpectationsSuite: self, column_name: str, value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column are greater than a specified value @@ -596,6 +745,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param value: The value to compare against + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... 
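A sketch of combining uniqueness and value checks with non-raising execution and inspecting the per-expectation violation sample (capped by `violation_sample_limit`); the column names and tag values are made up, and the exact shape of `violation_sample` is whatever `serialize_violations` produces:

```python
import pandas as pd

from dataframe_expectations import DataFrameExpectationsSuite
from dataframe_expectations.core import ExpectationStatus

suite = (
    DataFrameExpectationsSuite(violation_sample_limit=5)
    .expect_unique_rows(column_names=["user_id", "event_date"], tags=["category:integrity"])
    .expect_value_greater_than(column_name="event_count", value=0, tags=["category:integrity"])
)
runner = suite.build()

df = pd.DataFrame(
    {
        "user_id": [1, 1, 2],
        "event_date": ["2024-01-01", "2024-01-01", "2024-01-02"],
        "event_count": [3, 0, 5],
    }
)

# Non-raising mode returns a SuiteExecutionResult that can be logged or persisted.
result = runner.run(df, raise_on_failure=False)
for exp_result in result.results:
    if exp_result.status == ExpectationStatus.FAILED:
        # violation_sample holds at most `violation_sample_limit` serialized rows.
        print(exp_result.description, exp_result.violation_count)
        print(exp_result.violation_sample)
```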
@@ -604,6 +755,7 @@ class DataFrameExpectationsSuite: self, column_name: str, values: list, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column are in a specified list of values @@ -615,6 +767,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param values: The list of values to compare against + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -623,6 +777,7 @@ class DataFrameExpectationsSuite: self, column_name: str, value: Union[int, float], + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column are less than a specified value @@ -634,6 +789,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param value: The value to compare against + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -642,6 +799,7 @@ class DataFrameExpectationsSuite: self, column_name: str, value: object, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column do not equal a specified value @@ -653,6 +811,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param value: The value to compare against + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -661,6 +821,7 @@ class DataFrameExpectationsSuite: self, column_name: str, values: list, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column are not in a specified list of values @@ -672,6 +833,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check :param values: The list of values to compare against + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -679,6 +842,7 @@ class DataFrameExpectationsSuite: def expect_value_not_null( self, column_name: str, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column are not null @@ -689,6 +853,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -696,6 +862,7 @@ class DataFrameExpectationsSuite: def expect_value_null( self, column_name: str, + tags: Optional[List[str]] = None, ) -> DataFrameExpectationsSuite: """ Check if the values in a column are null @@ -706,6 +873,8 @@ class DataFrameExpectationsSuite: :param column_name: The name of the column to check + :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]). + :return: An instance of DataFrameExpectationsSuite. """ ... @@ -729,17 +898,25 @@ class DataFrameExpectationsSuite: """ ... - def build(self) -> DataFrameExpectationsSuiteRunner: + def build(self, tags: Optional[List[str]]=None, tag_match_mode: Optional[TagMatchMode]=None) -> DataFrameExpectationsSuiteRunner: """ Build an immutable runner from the current expectations. 
- The runner contains a snapshot of expectations at the time of building. + This creates a snapshot of the current expectations in the suite. You can continue to add more expectations to this suite and build new runners without affecting previously built runners. + :param tags: Optional tag filters as list of strings in "key:value" format. + Example: ["priority:high", "priority:medium"] + If None or empty, all expectations will be included. + :param tag_match_mode: How to match tags - TagMatchMode.ANY (OR logic) or TagMatchMode.ALL (AND logic). + Required if tags are provided, must be None if tags are not provided. + - TagMatchMode.ANY: Include expectations with ANY of the filter tags + - TagMatchMode.ALL: Include expectations with ALL of the filter tags :return: An immutable DataFrameExpectationsSuiteRunner instance. - :raises ValueError: If no expectations have been added. + :raises ValueError: If no expectations have been added, if tag_match_mode validation fails, + or if no expectations match the tag filters. """ ... diff --git a/docs/source/adding_expectations.rst b/docs/source/adding_expectations.rst index 9755d9f..28fce20 100644 --- a/docs/source/adding_expectations.rst +++ b/docs/source/adding_expectations.rst @@ -64,6 +64,7 @@ Once you have decided where the expectation needs to be added, you can define it def create_expectation_is_divisible(**kwargs) -> DataFrameColumnExpectation: column_name = kwargs["column_name"] value = kwargs["value"] + tags = kwargs.get("tags") return DataFrameColumnExpectation( expectation_name="ExpectIsDivisible", @@ -72,6 +73,7 @@ Once you have decided where the expectation needs to be added, you can define it fn_violations_pyspark=lambda df: df.filter(F.col(column_name) % value != 0), # function that finds violations description=f"'{column_name}' divisible by {value}", error_message=f"'{column_name}' not divisible by {value}.", + tags=tags, ) For additional guidance, you can refer to the implementation of ``ExpectationValueGreaterThan`` and @@ -84,7 +86,6 @@ The ``@register_expectation`` decorator is required and has the following mandat - ``category``: Use ``ExpectationCategory.COLUMN`` or ``ExpectationCategory.AGGREGATION`` - ``subcategory``: Choose from ``ExpectationSubcategory.NUMERICAL``, ``ExpectationSubcategory.STRING``, or ``ExpectationSubcategory.ANY_VALUE`` - ``pydoc``: A brief description of what the expectation does -- ``params``: List of parameter names (e.g., ["column_name", "value"]) - ``params_doc``: Dictionary mapping parameter names to their descriptions - ``param_types``: Dictionary mapping parameter names to their Python types @@ -132,7 +133,7 @@ Here's an example of how to implement an aggregation-based expectation: Expectation that validates the DataFrame has at least a minimum number of rows. 
""" - def __init__(self, min_count: int): + def __init__(self, min_count: int, tags: Optional[List[str]] = None): description = f"DataFrame has at least {min_count} row(s)" self.min_count = min_count @@ -140,6 +141,7 @@ Here's an example of how to implement an aggregation-based expectation: expectation_name="ExpectationMinRows", column_names=[], # Empty list since this operates on entire DataFrame description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -198,7 +200,6 @@ Here's an example of how to implement an aggregation-based expectation: category=ExpectationCategory.AGGREGATION, subcategory=ExpectationSubcategory.ANY_VALUE, pydoc="Expect DataFrame to have at least a minimum number of rows.", - params=["min_count"], params_doc={"min_count": "Minimum required number of rows"}, param_types={"min_count": int} ) @@ -213,7 +214,7 @@ Here's an example of how to implement an aggregation-based expectation: Returns: ExpectationMinRows: A configured expectation instance. """ - return ExpectationMinRows(min_count=kwargs["min_count"]) + return ExpectationMinRows(min_count=kwargs["min_count"], tags=kwargs.get("tags")) Key differences for aggregation-based expectations: @@ -236,7 +237,7 @@ Example of a column-based aggregation expectation: Expectation that validates the mean value of a column falls within a specified range. """ - def __init__(self, column_name: str, min_value: float, max_value: float): + def __init__(self, column_name: str, min_value: float, max_value: float, tags: Optional[List[str]] = None): description = f"column '{column_name}' mean value between {min_value} and {max_value}" self.column_name = column_name @@ -247,6 +248,7 @@ Example of a column-based aggregation expectation: expectation_name="ExpectationColumnMeanBetween", column_names=[column_name], # List of columns this expectation requires description=description, + tags=tags, ) def aggregate_and_validate_pandas( @@ -415,6 +417,13 @@ The method names are automatically derived by: No manual integration is required! Simply register your expectation and it will be available in the suite. +**Note for Expectation Authors:** + +Your expectations automatically support tagging without any additional implementation. The tagging functionality is +handled by the ``DataFrameExpectation`` base class and the suite builder. Users simply pass the ``tags`` parameter +when adding expectations to their suite. See the Getting Started guide for details on how users can leverage tags +for selective execution. + Generating Type Stubs for IDE Support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -424,8 +433,9 @@ To provide IDE autocomplete and type hints for all expect methods, run the stub uv run python scripts/generate_suite_stubs.py -This creates ``suite.pyi`` with type hints for all registered expectations. The stub file is automatically -validated by the sanity check script and pre-commit hooks. +This creates ``suite.pyi`` with type hints for all registered expectations. The stub generator automatically adds +the ``tags`` parameter to all expectation method signatures with appropriate documentation, so you don't need to +include it in your ``params_doc``. The stub file is automatically validated by the sanity check script and pre-commit hooks. 
Adding Unit Tests ----------------- diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 783a0bc..15782d3 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -168,6 +168,52 @@ When validations fail, you'll see detailed output like this: +-----+------+--------+ ================================================================================ +Tag-Based Filtering for Selective Execution +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can tag expectations and selectively run them based on priority, environment, or custom categories: + +.. code-block:: python + + from dataframe_expectations import DataFrameExpectationsSuite, TagMatchMode + + # Tag expectations with priorities and environments + suite = ( + DataFrameExpectationsSuite() + .expect_value_greater_than(column_name="age", value=18, tags=["priority:high", "env:prod"]) + .expect_value_not_null(column_name="name", tags=["priority:high"]) + .expect_min_rows(min_rows=1, tags=["priority:low", "env:test"]) + ) + + # Run only high-priority checks (OR logic - matches ANY tag) + runner = suite.build(tags=["priority:high"], tag_match_mode=TagMatchMode.ANY) + runner.run(df) + + # Run production-critical checks (AND logic - matches ALL tags) + runner = suite.build(tags=["priority:high", "env:prod"], tag_match_mode=TagMatchMode.ALL) + runner.run(df) + +Programmatic Result Inspection +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Get detailed validation results without raising exceptions: + +.. code-block:: python + + # Get detailed results without raising exceptions + result = runner.run(df, raise_on_failure=False) + + # Inspect validation outcomes + print(f"Total: {result.total_expectations}, Passed: {result.total_passed}, Failed: {result.total_failed}") + print(f"Pass rate: {result.pass_rate:.2%}") + print(f"Duration: {result.total_duration_seconds:.2f}s") + print(f"Applied filters: {result.applied_filters}") + + # Access individual results + for exp_result in result.results: + if exp_result.status == "failed": + print(f"Failed: {exp_result.description} - {exp_result.violation_count} violations") + How to contribute? ------------------ Contributions are welcome! 
You can enhance the library by adding new expectations, refining existing ones, or improving diff --git a/pyproject.toml b/pyproject.toml index 262667e..8528148 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ dev = [ "numpy>=1.21.0", "pytest>=7.0.0", + "pytest-xdist>=3.0.0", "pre-commit>=2.20.0", "ruff>=0.1.0", "pytest-cov>=4.0.0", diff --git a/scripts/generate_suite_stubs.py b/scripts/generate_suite_stubs.py index b71f2f0..9058ffb 100755 --- a/scripts/generate_suite_stubs.py +++ b/scripts/generate_suite_stubs.py @@ -44,22 +44,26 @@ def transform_suite_to_stub() -> str: # Process imports for node in tree.body: - if isinstance(node, (ast.Import, ast.ImportFrom)): - # Skip logger import - if isinstance(node, ast.ImportFrom) and node.module == 'dataframe_expectations.logging_utils': + match node: + case ast.ImportFrom(module='dataframe_expectations.logging_utils'): + # Skip logger import continue - # Add Union to typing imports if not present - unparsed = ast.unparse(node) - if isinstance(node, ast.ImportFrom) and node.module == 'typing' and 'Union' not in unparsed: - unparsed = unparsed.replace('from typing import ', 'from typing import Union, ') - stub_lines.append(unparsed) + case ast.ImportFrom(module='typing') as import_node: + # Add Union to typing imports if not present + unparsed = ast.unparse(import_node) + if 'Union' not in unparsed: + unparsed = unparsed.replace('from typing import ', 'from typing import Union, ') + stub_lines.append(unparsed) + case ast.Import() | ast.ImportFrom(): + stub_lines.append(ast.unparse(node)) stub_lines.append('') # Empty line after imports # Process classes for node in tree.body: - if isinstance(node, ast.ClassDef): - stub_lines.append(format_class_stub(node)) + match node: + case ast.ClassDef(): + stub_lines.append(format_class_stub(node)) return '\n'.join(stub_lines) @@ -76,23 +80,22 @@ def format_class_stub(class_node: ast.ClassDef) -> str: lines.append(f'class {class_node.name}:') # Class docstring - if (class_node.body and - isinstance(class_node.body[0], ast.Expr) and - isinstance(class_node.body[0].value, ast.Constant) and - isinstance(class_node.body[0].value.value, str)): - docstring = class_node.body[0].value.value - lines.append(f' """{docstring}"""') - body_start = 1 - else: - body_start = 0 + match class_node.body: + # Check for docstring in first statement + case [ast.Expr(value=ast.Constant(value=str() as docstring)), *_]: + lines.append(f' """{docstring}"""') + body_start = 1 + case _: + body_start = 0 # Process methods and properties for item in class_node.body[body_start:]: - if isinstance(item, ast.FunctionDef): - lines.append(format_method_stub(item)) - elif isinstance(item, ast.Assign): - # Keep class-level assignments - lines.append(f' {ast.unparse(item)}') + match item: + case ast.FunctionDef(): + lines.append(format_method_stub(item)) + case ast.Assign(): + # Keep class-level assignments + lines.append(f' {ast.unparse(item)}') lines.append('') # Empty line after class return '\n'.join(lines) @@ -112,17 +115,15 @@ def format_method_stub(func_node: ast.FunctionDef) -> str: lines.append(f' def {func_node.name}({args}){returns}:') # Method docstring - if (func_node.body and - isinstance(func_node.body[0], ast.Expr) and - isinstance(func_node.body[0].value, ast.Constant) and - isinstance(func_node.body[0].value.value, str)): - docstring = func_node.body[0].value.value - # Format docstring with proper indentation - docstring_lines = docstring.split('\n') - lines.append(' """') - for line in 
docstring_lines: - lines.append(f' {line}' if line.strip() else '') - lines.append(' """') + match func_node.body: + # Check for docstring in first statement + case [ast.Expr(value=ast.Constant(value=str() as docstring)), *_]: + # Format docstring with proper indentation + docstring_lines = docstring.split('\n') + lines.append(' """') + for line in docstring_lines: + lines.append(f' {line}' if line.strip() else '') + lines.append(' """') # Add ... lines.append(' ...') @@ -175,6 +176,9 @@ def generate_stub_method( type_str = format_type_hint(param_type) param_list.append(f"{param}: {type_str}") + # Add tags parameter (always optional) + param_list.append("tags: Optional[List[str]] = None") + params_signature = ",\n ".join(param_list) # Build docstring @@ -198,6 +202,10 @@ def generate_stub_method( docstring_lines.append(f' :param {param}: {param_doc}') docstring_lines.append('') + # Add tags parameter documentation + docstring_lines.append(' :param tags: Optional tags as list of strings in "key:value" format (e.g., ["priority:high", "env:test"]).') + docstring_lines.append('') + # Add return documentation docstring_lines.append(' :return: An instance of DataFrameExpectationsSuite.') docstring_lines.append(' """') diff --git a/scripts/sanity_checks.py b/scripts/sanity_checks.py index ef14493..b228629 100644 --- a/scripts/sanity_checks.py +++ b/scripts/sanity_checks.py @@ -4,17 +4,18 @@ This script validates consistency across the entire expectations framework by checking: 1. All expectations implemented in the expectations/ directory are registered in the registry 2. All registered expectations have corresponding expect_* methods in DataFrameExpectationsSuite -3. All registered expectations have corresponding unit tests in tests/dataframe_expectations/expectations_implemented/ +3. 
All registered expectations have corresponding unit tests in tests/expectations/ Usage: - python sanity_check_expectations.py + python scripts/sanity_checks.py + python scripts/sanity_checks.py --verbose """ import ast import re import sys from pathlib import Path -from typing import Dict, List, Optional, Set +from typing import Dict, Optional class ExpectationsSanityChecker: @@ -23,106 +24,78 @@ class ExpectationsSanityChecker: def __init__(self, project_root: Path): self.project_root = project_root self.expectations_dir = project_root / "dataframe_expectations" / "expectations" - self.suite_file = project_root / "dataframe_expectations" / "suite.py" self.stub_file = project_root / "dataframe_expectations" / "suite.pyi" self.tests_dir = project_root / "tests" / "expectations" # Results storage self.registered_expectations: Dict[str, str] = {} # expectation_name -> file_path - self.suite_methods: Set[str] = set() # expect_* method names + self.suite_methods: set[str] = set() # expect_* method names self.test_files: Dict[str, str] = {} # expectation_name -> test_file_path - - # Issues tracking - self.issues: List[str] = [] + self.issues: list[str] = [] def run_full_check(self) -> bool: """Run all consistency checks and return True if all pass.""" print("🔍 Starting DataFrame Expectations Framework Sanity Check...") print("=" * 70) - # Step 1: Discover registered expectations - print("\n📋 Step 1: Discovering registered expectations...") - self._discover_registered_expectations() - print(f" Found {len(self.registered_expectations)} registered expectations") - - # Step 2: Discover suite methods - print("\n🎯 Step 2: Discovering suite methods...") - self._discover_suite_methods() - print(f" Found {len(self.suite_methods)} expect_* methods in suite") - - # Step 3: Discover test files - print("\n🧪 Step 3: Discovering test files...") - self._discover_test_files() - print(f" Found {len(self.test_files)} test files") + steps = [ + ("📋 Discovering registered expectations", self._discover_registered_expectations, + lambda: f"Found {len(self.registered_expectations)} registered expectations"), + ("🎯 Discovering suite methods", self._discover_suite_methods, + lambda: f"Found {len(self.suite_methods)} expect_* methods"), + ("🧪 Discovering test files", self._discover_test_files, + lambda: f"Found {len(self.test_files)} test files"), + ("📝 Validating stub file", self._validate_stub_file, None), + ("✅ Validating consistency", self._validate_consistency, None), + ("🏷️ Checking expectation constructors", self._check_expectation_constructor_tags, None), + ] + + for i, (description, func, result_msg) in enumerate(steps, 1): + print(f"\nStep {i}: {description}...") + func() + if result_msg: + print(f" {result_msg()}") - # Step 4: Validate stub file - print("\n📝 Step 4: Validating stub file...") - self._validate_stub_file() + self._print_results() + return len(self.issues) == 0 - # Step 5: Validate consistency - print("\n✅ Step 5: Validating consistency...") + def _validate_consistency(self): + """Run all consistency validation checks.""" self._validate_registry_to_suite_mapping() self._validate_registry_to_tests_mapping() self._validate_orphaned_suite_methods() self._validate_orphaned_test_files() - # Report results - self._print_results() - - return len(self.issues) == 0 - def _discover_registered_expectations(self): """Find all @register_expectation decorators in expectation files.""" - expectation_files = list(self.expectations_dir.rglob("*.py")) - - for file_path in expectation_files: + for file_path in 
self.expectations_dir.rglob("*.py"): if file_path.name == "__init__.py": continue try: - with open(file_path, "r") as f: - content = f.read() - - # Parse AST to find @register_expectation decorators - tree = ast.parse(content) - + tree = ast.parse(file_path.read_text()) for node in ast.walk(tree): if isinstance(node, ast.FunctionDef): - for decorator in node.decorator_list: - if self._is_register_expectation_decorator(decorator): - expectation_name = self._extract_expectation_name(decorator) - if expectation_name: - self.registered_expectations[expectation_name] = str(file_path) - + expectation_name = self._extract_registered_expectation_name(node) + if expectation_name: + self.registered_expectations[expectation_name] = str(file_path) except Exception as e: - print(f" ⚠️ Warning: Could not parse {file_path}: {e}") - - def _is_register_expectation_decorator(self, decorator) -> bool: - """Check if a decorator is @register_expectation.""" - if isinstance(decorator, ast.Call): - if isinstance(decorator.func, ast.Name) and decorator.func.id == "register_expectation": - return True - return False - - def _extract_expectation_name(self, decorator) -> Optional[str]: - """Extract expectation name from @register_expectation("Name") decorator.""" - if isinstance(decorator, ast.Call) and decorator.args: - first_arg = decorator.args[0] - if isinstance(first_arg, ast.Constant): - return str(first_arg.value) + print(f" ⚠️ Warning: Could not parse {file_path.name}: {e}") + + def _extract_registered_expectation_name(self, func_node: ast.FunctionDef) -> Optional[str]: + """Extract expectation name from @register_expectation decorator if present.""" + for decorator in func_node.decorator_list: + if isinstance(decorator, ast.Call): + if isinstance(decorator.func, ast.Name) and decorator.func.id == "register_expectation": + if decorator.args and isinstance(decorator.args[0], ast.Constant): + return str(decorator.args[0].value) return None def _discover_suite_methods(self): """Find all expect_* methods available via the registry.""" try: - from dataframe_expectations.registry import ( - DataFrameExpectationRegistry, - ) - - # Get the mapping of suite methods from the registry - mapping = DataFrameExpectationRegistry.get_suite_method_mapping() - self.suite_methods = set(mapping.keys()) - + from dataframe_expectations.registry import DataFrameExpectationRegistry + self.suite_methods = set(DataFrameExpectationRegistry.get_suite_method_mapping().keys()) except Exception as e: self.issues.append(f"❌ Could not load suite methods from registry: {e}") @@ -132,26 +105,18 @@ def _discover_test_files(self): self.issues.append(f"❌ Tests directory not found: {self.tests_dir}") return - test_files = list(self.tests_dir.rglob("test_*.py")) - - for test_file in test_files: - # Skip template files + for test_file in self.tests_dir.rglob("test_*.py"): if "template" in test_file.name.lower(): continue - # Extract potential expectation name from filename - # e.g., test_expect_value_equals.py -> ExpectationValueEquals - filename = test_file.stem - if filename.startswith("test_expect_"): - # Convert test_expect_value_equals -> ValueEquals - expectation_part = filename[12:] # Remove "test_expect_" - expectation_name = "Expectation" + self._snake_to_pascal_case(expectation_part) + # Convert test_expect_value_equals.py -> ExpectationValueEquals + if test_file.stem.startswith("test_expect_"): + expectation_part = test_file.stem[12:] # Remove "test_expect_" + expectation_name = 
f"Expectation{self._snake_to_pascal_case(expectation_part)}" self.test_files[expectation_name] = str(test_file) def _validate_stub_file(self): """Check if the stub file is up-to-date with registered expectations.""" - print(" 📝 Checking if stub file is up-to-date...") - if not self.stub_file.exists(): self.issues.append( f"❌ Stub file not found: {self.stub_file}\n" @@ -162,12 +127,8 @@ def _validate_stub_file(self): try: from generate_suite_stubs import generate_pyi_file - # Generate what the content should be expected_content = generate_pyi_file() - - # Read the current stub file - with open(self.stub_file, "r") as f: - actual_content = f.read() + actual_content = self.stub_file.read_text() if expected_content != actual_content: self.issues.append( @@ -177,102 +138,147 @@ def _validate_stub_file(self): except Exception as e: self.issues.append(f"❌ Could not validate stub file: {e}") - def _snake_to_pascal_case(self, snake_str: str) -> str: + @staticmethod + def _snake_to_pascal_case(snake_str: str) -> str: """Convert snake_case to PascalCase.""" - components = snake_str.split("_") - return "".join(word.capitalize() for word in components) + return "".join(word.capitalize() for word in snake_str.split("_")) def _validate_registry_to_suite_mapping(self): """Check that all registered expectations have suite methods.""" - print(" 🔗 Checking registry -> suite mapping...") - - missing_suite_methods = [] - - for expectation_name in self.registered_expectations.keys(): - # Convert expectation name to expected suite method name - expected_method = self._expectation_to_suite_method(expectation_name) - - if expected_method not in self.suite_methods: - missing_suite_methods.append((expectation_name, expected_method)) + missing = [ + (name, self._expectation_to_suite_method(name)) + for name in self.registered_expectations + if self._expectation_to_suite_method(name) not in self.suite_methods + ] - if missing_suite_methods: + if missing: self.issues.append("❌ Registered expectations missing suite methods:") - for exp_name, method_name in missing_suite_methods: - self.issues.append(f" • {exp_name} -> missing {method_name}()") + self.issues.extend(f" • {name} -> missing {method}()" for name, method in missing) def _validate_registry_to_tests_mapping(self): """Check that all registered expectations have test files.""" - print(" 🧪 Checking registry -> tests mapping...") + missing = [name for name in self.registered_expectations if name not in self.test_files] - missing_tests = [] - - for expectation_name in self.registered_expectations.keys(): - if expectation_name not in self.test_files: - missing_tests.append(expectation_name) - - if missing_tests: + if missing: self.issues.append("❌ Registered expectations missing test files:") - for exp_name in missing_tests: - expected_test_file = self._expectation_to_test_filename(exp_name) - self.issues.append(f" • {exp_name} -> missing {expected_test_file}") + self.issues.extend( + f" • {name} -> missing {self._expectation_to_test_filename(name)}" + for name in missing + ) def _validate_orphaned_suite_methods(self): """Check for suite methods without corresponding registered expectations.""" - print(" 🔍 Checking for orphaned suite methods...") - - orphaned_methods = [] + orphaned = [ + (method, self._suite_method_to_expectation(method)) + for method in self.suite_methods + if self._suite_method_to_expectation(method) not in self.registered_expectations + ] - for method_name in self.suite_methods: - expected_expectation = 
self._suite_method_to_expectation(method_name) - - if expected_expectation not in self.registered_expectations: - orphaned_methods.append((method_name, expected_expectation)) - - if orphaned_methods: + if orphaned: self.issues.append("❌ Suite methods without registered expectations:") - for method_name, exp_name in orphaned_methods: - self.issues.append(f" • {method_name}() -> missing {exp_name}") + self.issues.extend(f" • {method}() -> missing {exp}" for method, exp in orphaned) def _validate_orphaned_test_files(self): """Check for test files without corresponding registered expectations.""" - print(" 🧪 Checking for orphaned test files...") - - orphaned_tests = [] - - for expectation_name, test_file in self.test_files.items(): - if expectation_name not in self.registered_expectations: - orphaned_tests.append((expectation_name, test_file)) + orphaned = [ + (name, path) + for name, path in self.test_files.items() + if name not in self.registered_expectations + ] - if orphaned_tests: + if orphaned: self.issues.append("❌ Test files without registered expectations:") - for exp_name, test_file in orphaned_tests: - self.issues.append(f" • {test_file} -> missing {exp_name}") - - def _expectation_to_suite_method(self, expectation_name: str) -> str: - """Convert expectation name to expected suite method name.""" - # Remove "Expectation" prefix if present - if expectation_name.startswith("Expectation"): - name_part = expectation_name[11:] # Remove "Expectation" - else: - name_part = expectation_name + self.issues.extend(f" • {path} -> missing {name}" for name, path in orphaned) - # Convert PascalCase to snake_case and add "expect_" prefix + @staticmethod + def _expectation_to_suite_method(expectation_name: str) -> str: + """Convert ExpectationFooBar to expect_foo_bar.""" + name_part = expectation_name.removeprefix("Expectation") snake_case = re.sub("([A-Z])", r"_\1", name_part).lower().lstrip("_") return f"expect_{snake_case}" def _suite_method_to_expectation(self, method_name: str) -> str: - """Convert suite method name to expected expectation name.""" + """Convert expect_foo_bar to ExpectationFooBar.""" if method_name.startswith("expect_"): - name_part = method_name[7:] # Remove "expect_" - # Convert snake_case to PascalCase and add "Expectation" prefix - pascal_case = self._snake_to_pascal_case(name_part) - return f"Expectation{pascal_case}" + name_part = method_name[7:] + return f"Expectation{self._snake_to_pascal_case(name_part)}" return method_name def _expectation_to_test_filename(self, expectation_name: str) -> str: - """Convert expectation name to expected test filename.""" - method_name = self._expectation_to_suite_method(expectation_name) - return f"test_{method_name}.py" + """Convert expectation name to test_expect_foo_bar.py.""" + return f"test_{self._expectation_to_suite_method(expectation_name)}.py" + + def _check_expectation_constructor_tags(self): + """Check that all DataFrameExpectation subclasses accept 'tags' and pass to super().__init__.""" + for file_path in self.expectations_dir.rglob("*.py"): + if file_path.name == "__init__.py": + continue + + try: + tree = ast.parse(file_path.read_text()) + for node in ast.walk(tree): + if not isinstance(node, ast.ClassDef) or not self._is_expectation_class(node): + continue + + init_method = self._find_init_method(node) + if not init_method: + continue + + if not self._has_tags_param(init_method): + self.issues.append( + f"❌ {node.name} in {file_path.name} missing 'tags' param in __init__" + ) + + if not 
self._has_super_init_with_tags(init_method): + self.issues.append( + f"❌ {node.name} in {file_path.name} missing super().__init__(tags=tags)" + ) + except Exception as e: + print(f" ⚠️ Warning: Could not parse {file_path.name}: {e}") + + @staticmethod + def _is_expectation_class(node: ast.ClassDef) -> bool: + """Heuristically check whether a class is an expectation: a direct base is DataFrameExpectation or its name ends with 'Expectation'.""" + return any( + isinstance(base, ast.Name) and (base.id == "DataFrameExpectation" or base.id.endswith("Expectation")) + for base in node.bases + ) + + @staticmethod + def _find_init_method(class_node: ast.ClassDef) -> Optional[ast.FunctionDef]: + """Find the __init__ method in a class.""" + return next( + (item for item in class_node.body if isinstance(item, ast.FunctionDef) and item.name == "__init__"), + None + ) + + @staticmethod + def _has_tags_param(init_method: ast.FunctionDef) -> bool: + """Check if __init__ has a 'tags' parameter with a valid (or omitted) type annotation.""" + valid_annotations = { + "Optional[List[str]]", "List[str] | None", "Union[List[str], None]", + "Optional[list[str]]", "list[str] | None" + } + + for arg in init_method.args.args: + if arg.arg == "tags": + if not arg.annotation: + return True + return ast.unparse(arg.annotation) in valid_annotations + return False + + @staticmethod + def _has_super_init_with_tags(init_method: ast.FunctionDef) -> bool: + """Check if __init__ calls super().__init__(tags=tags).""" + for stmt in ast.walk(init_method): + if (isinstance(stmt, ast.Call) and + isinstance(stmt.func, ast.Attribute) and stmt.func.attr == "__init__" and + isinstance(stmt.func.value, ast.Call) and + isinstance(stmt.func.value.func, ast.Name) and stmt.func.value.func.id == "super"): + + if any(kw.arg == "tags" for kw in stmt.keywords): + return True + return False def _print_results(self): """Print the final results of the sanity check.""" @@ -282,15 +288,14 @@ def _print_results(self): print("\n📈 Summary:") print(f" • Registered expectations: {len(self.registered_expectations)}") - print(f" • Suite methods: {len(self.suite_methods)}") - print(f" • Test files: {len(self.test_files)}") - print(f" • Issues found: {len(self.issues)}") + print(f" • Suite methods:            {len(self.suite_methods)}") + print(f" • Test files:               {len(self.test_files)}") + print(f" • Issues found:             {len(self.issues)}") if self.issues: print(f"\n❌ ISSUES FOUND ({len(self.issues)}):") print("-" * 40) - for issue in self.issues: - print(issue) + print("\n".join(self.issues)) else: print("\n✅ ALL CHECKS PASSED!") print(" The expectations framework is consistent across:") @@ -305,123 +310,47 @@ def print_detailed_mappings(self): print("\n🔍 DETAILED MAPPINGS") print("=" * 50) - print(f"\n📋 Registered Expectations ({len(self.registered_expectations)}):") - for name, file_path in sorted(self.registered_expectations.items()): - print(f" • {name} ({Path(file_path).name})") - - print(f"\n🎯 Suite Methods ({len(self.suite_methods)}):") - for method in sorted(self.suite_methods): - print(f" • {method}()") - - print(f"\n🧪 Test Files ({len(self.test_files)}):") - for name, file_path in sorted(self.test_files.items()): - print(f" • {name} -> {Path(file_path).name}") - - def should_run_check(self) -> bool: - """Check if we should run based on changed files in the current branch.""" - import subprocess + mappings = [ + (f"📋 Registered Expectations ({len(self.registered_expectations)})", + sorted((f"{name} ({Path(path).name})", name, path) for name, path in self.registered_expectations.items())), + (f"🎯 Suite Methods 
({len(self.suite_methods)})", + sorted((f"{method}()", method, None) for method in self.suite_methods)), + (f"🧪 Test Files ({len(self.test_files)})", + sorted((f"{name} -> {Path(path).name}", name, path) for name, path in self.test_files.items())), + ] - try: - # Try to get the default branch name (usually main or master) - try: - result = subprocess.run( - ["git", "symbolic-ref", "refs/remotes/origin/HEAD"], - capture_output=True, - text=True, - check=True, - ) - default_branch = result.stdout.strip().split("/")[-1] - except subprocess.CalledProcessError: - # Fallback to common default branch names - for branch in ["main", "master"]: - try: - subprocess.run( - ["git", "rev-parse", f"origin/{branch}"], - capture_output=True, - text=True, - check=True, - ) - default_branch = branch - break - except subprocess.CalledProcessError: - continue - else: - default_branch = "main" # Final fallback - - # Get list of changed files compared to default branch - result = subprocess.run( - ["git", "diff", f"origin/{default_branch}...HEAD", "--name-only"], - capture_output=True, - text=True, - check=True, - ) - changed_files = [f for f in result.stdout.strip().split("\n") if f] - - if not changed_files: - print("🔍 No files changed, skipping sanity check.") - return False - - # Check if any relevant files changed - relevant_patterns = [ - "dataframe_expectations/", - "tests/dataframe_expectations/", - ] - - changed_relevant_files = [] - for file in changed_files: - for pattern in relevant_patterns: - if pattern in file: - changed_relevant_files.append(file) - break - - if changed_relevant_files: - print("🔍 Relevant DataFrame expectations files changed:") - for file in changed_relevant_files: - print(f" • {file}") - return True - else: - print("🔍 No relevant DataFrame expectations files changed, skipping sanity check.") - return False - - except subprocess.CalledProcessError as e: - print(f"⚠️ Git command failed: {e}") - print("🔍 Running sanity check anyway as a safety measure.") - return True - except Exception as e: - print(f"⚠️ Error checking changed files: {e}") - print("🔍 Running sanity check anyway as a safety measure.") - return True + for title, items in mappings: + print(f"\n{title}:") + for display, *_ in items: + print(f" • {display}") -if __name__ == "__main__": - # Use relative path from the script location - script_dir = Path(__file__).parent - # Go up one level: sanity_checks.py is in scripts/, project root is parent - project_root = script_dir.parent +def main(): + """Main entry point for the sanity check script.""" + project_root = Path(__file__).parent.parent - # Add project root to sys.path to enable imports - # This must be done before creating the checker since imports happen during initialization + # Add project root to sys.path for imports if str(project_root) not in sys.path: sys.path.insert(0, str(project_root)) # Validate directory structure - expected_dirs = ["dataframe_expectations", "tests", "pyproject.toml"] - missing_dirs = [d for d in expected_dirs if not (project_root / d).exists()] + expected_paths = ["dataframe_expectations", "tests", "pyproject.toml"] + missing = [p for p in expected_paths if not (project_root / p).exists()] - if missing_dirs: - print(f"❌ Missing expected directories/files: {missing_dirs}") - print(f"Script location: {Path(__file__)}") - print(f"Project root: {project_root}") + if missing: + print(f"❌ Missing expected directories/files: {missing}") + print(f" Script location: {Path(__file__)}") + print(f" Project root: {project_root}") sys.exit(1) checker 
= ExpectationsSanityChecker(project_root) - - # Run the checks success = checker.run_full_check() - # Optionally print detailed mappings for debugging if "--verbose" in sys.argv or "-v" in sys.argv: checker.print_detailed_mappings() - # Exit with appropriate code sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/tests/base/test_suite.py b/tests/base/test_suite.py index 6d0a148..e4a9d01 100644 --- a/tests/base/test_suite.py +++ b/tests/base/test_suite.py @@ -9,6 +9,7 @@ from dataframe_expectations.result_message import ( DataFrameExpectationFailureMessage, ) +from dataframe_expectations.core.suite_result import SuiteExecutionResult def test_suite_success(): @@ -25,7 +26,11 @@ def test_suite_success(): data_Frame = pd.DataFrame({"col1": [3, 4, 5]}) runner = suite.build() result = runner.run(data_frame=data_Frame) - assert result is None, "Expected no result for successful suite" + assert result is not None, "Expected SuiteExecutionResult" + assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult" + assert result.success, "Expected all expectations to pass" + assert result.total_passed == 2, "Expected 2 passed expectations" + assert result.total_failed == 0, "Expected 0 failed expectations" def test_suite_failure(): @@ -74,12 +79,18 @@ def test_suite_with_supported_dataframe_types(spark): # Test with pandas DataFrame pandas_df = pd.DataFrame({"col1": [1, 2, 3]}) result = runner.run(data_frame=pandas_df) - assert result is None, "Expected success for pandas DataFrame" + assert result is not None, "Expected SuiteExecutionResult for pandas DataFrame" + assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult" + assert result.success, "Expected success for pandas DataFrame" + assert result.dataframe_type == DataFrameType.PANDAS # Test with PySpark DataFrame spark_df = spark.createDataFrame([(1,), (2,), (3,)], ["col1"]) result = runner.run(data_frame=spark_df) - assert result is None, "Expected success for PySpark DataFrame" + assert result is not None, "Expected SuiteExecutionResult for PySpark DataFrame" + assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult" + assert result.success, "Expected success for PySpark DataFrame" + assert result.dataframe_type == DataFrameType.PYSPARK def test_suite_with_unsupported_dataframe_types(): @@ -126,6 +137,9 @@ def unpersist(self): self.is_cached = False return self + def count(self): + return 0 + suite = DataFrameExpectationsSuite().expect_min_rows(min_rows=0) runner = suite.build() @@ -147,7 +161,9 @@ def unpersist(self): mock_connect_df = MockConnectDataFrame() result = runner.run(data_frame=mock_connect_df) - assert result is None, "Expected success for mock Connect DataFrame" + assert result is not None, "Expected SuiteExecutionResult for mock Connect DataFrame" + assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult" + assert result.success, "Expected success for mock Connect DataFrame" def test_expectation_suite_failure_message(): @@ -208,8 +224,8 @@ def test_builder_pattern_immutability(): runner1 = suite.build() # Verify runner1 has exactly 1 expectation - assert runner1.expectation_count == 1, "Runner1 should have 1 expectation" - expectations_list = runner1.list_expectations() + assert runner1.total_expectations == 1, "Runner1 should have 1 expectation" + expectations_list = runner1.list_all_expectations() assert len(expectations_list) == 1 assert expectations_list[0] == 
"ExpectationValueGreaterThan ('col1' is greater than 5)" @@ -220,9 +236,9 @@ def test_builder_pattern_immutability(): runner2 = suite.build() # Verify runner2 has 2 expectations but runner1 is unchanged - assert runner1.expectation_count == 1, "Runner1 should still have 1 expectation (immutable)" - assert runner2.expectation_count == 2, "Runner2 should have 2 expectations" - expectations_list2 = runner2.list_expectations() + assert runner1.total_expectations == 1, "Runner1 should still have 1 expectation (immutable)" + assert runner2.total_expectations == 2, "Runner2 should have 2 expectations" + expectations_list2 = runner2.list_all_expectations() assert len(expectations_list2) == 2 assert expectations_list2[0] == "ExpectationValueGreaterThan ('col1' is greater than 5)" assert expectations_list2[1] == "ExpectationValueLessThan ('col1' is less than 20)" @@ -232,11 +248,17 @@ def test_builder_pattern_immutability(): # Runner1 should only have 1 expectation (passes) result1 = runner1.run(data_frame=df) - assert result1 is None, "Runner1 should pass with only 1 expectation" + assert result1 is not None, "Runner1 should return SuiteExecutionResult" + assert isinstance(result1, SuiteExecutionResult), "Result1 should be SuiteExecutionResult" + assert result1.success, "Runner1 should pass with only 1 expectation" + assert result1.total_passed == 1 # Runner2 should have 2 expectations (passes) result2 = runner2.run(data_frame=df) - assert result2 is None, "Runner2 should pass with 2 expectations" + assert result2 is not None, "Runner2 should return SuiteExecutionResult" + assert isinstance(result2, SuiteExecutionResult), "Result2 should be SuiteExecutionResult" + assert result2.success, "Runner2 should pass with 2 expectations" + assert result2.total_passed == 2 def test_decorator_success(): diff --git a/tests/base/test_suite_result.py b/tests/base/test_suite_result.py new file mode 100644 index 0000000..b0400c9 --- /dev/null +++ b/tests/base/test_suite_result.py @@ -0,0 +1,406 @@ +"""Comprehensive unit tests for SuiteExecutionResult and ExpectationResult.""" + +import pytest +from datetime import datetime, timedelta +from dataframe_expectations.core.suite_result import ( + ExpectationResult, + ExpectationStatus, + SuiteExecutionResult, + serialize_violations, +) +from dataframe_expectations.core.types import DataFrameType +from dataframe_expectations.core.tagging import TagSet +import pandas as pd + + +@pytest.mark.parametrize( + "status, error_message, violation_count, violation_sample", + [ + (ExpectationStatus.PASSED, None, None, None), + (ExpectationStatus.FAILED, "Validation failed", 10, [{"col": "value"}]), + (ExpectationStatus.SKIPPED, None, None, None), + ], +) +def test_expectation_result_creation(status, error_message, violation_count, violation_sample): + """Test creating ExpectationResult with various statuses.""" + result = ExpectationResult( + expectation_name="TestExpectation", + description="Test description", + status=status, + tags=TagSet(["priority:high"]), + error_message=error_message, + violation_count=violation_count, + violation_sample=violation_sample, + ) + + assert result.expectation_name == "TestExpectation" + assert result.description == "Test description" + assert result.status == status + assert result.error_message == error_message + assert result.violation_count == violation_count + assert result.violation_sample == violation_sample + + +def test_expectation_result_immutability(): + """Test that ExpectationResult is immutable.""" + result = ExpectationResult( + 
expectation_name="TestExpectation", + description="Test description", + status=ExpectationStatus.PASSED, + ) + + with pytest.raises(Exception): # Pydantic raises ValidationError or AttributeError + result.status = ExpectationStatus.FAILED # type: ignore + + +def test_expectation_result_with_tags(): + """Test ExpectationResult with tags.""" + tags = TagSet(["priority:high", "env:test"]) + result = ExpectationResult( + expectation_name="TestExpectation", + description="Test description", + status=ExpectationStatus.PASSED, + tags=tags, + ) + + assert result.tags is not None + assert len(result.tags) == 2 + + +def test_expectation_result_without_tags(): + """Test ExpectationResult without tags.""" + result = ExpectationResult( + expectation_name="TestExpectation", + description="Test description", + status=ExpectationStatus.PASSED, + ) + + assert result.tags is None + + +def test_suite_result_basic_creation(): + """Test creating a basic SuiteExecutionResult.""" + start = datetime.now() + end = start + timedelta(seconds=5) + + result = SuiteExecutionResult( + suite_name="TestSuite", + context={"job_id": "123"}, + applied_filters=TagSet(["priority:high"]), + tag_match_mode="any", + results=[], + start_time=start, + end_time=end, + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert result.suite_name == "TestSuite" + assert result.context == {"job_id": "123"} + assert len(result.applied_filters) == 1 + assert result.tag_match_mode == "any" + assert result.dataframe_type == DataFrameType.PANDAS + assert result.dataframe_row_count == 100 + + +def test_suite_result_immutability(): + """Test that SuiteExecutionResult is immutable.""" + start = datetime.now() + end = start + timedelta(seconds=5) + + result = SuiteExecutionResult( + results=[], + start_time=start, + end_time=end, + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + with pytest.raises(Exception): + result.suite_name = "NewName" # type: ignore + + +@pytest.fixture +def sample_results(): + """Create sample expectation results.""" + return [ + ExpectationResult( + expectation_name="Exp1", + description="Desc1", + status=ExpectationStatus.PASSED, + ), + ExpectationResult( + expectation_name="Exp2", + description="Desc2", + status=ExpectationStatus.PASSED, + ), + ExpectationResult( + expectation_name="Exp3", + description="Desc3", + status=ExpectationStatus.FAILED, + error_message="Failed", + violation_count=5, + ), + ExpectationResult( + expectation_name="Exp4", + description="Desc4", + status=ExpectationStatus.SKIPPED, + ), + ] + + +def test_total_duration_seconds(): + """Test total_duration_seconds computation.""" + start = datetime(2024, 1, 1, 12, 0, 0) + end = datetime(2024, 1, 1, 12, 0, 10) + + result = SuiteExecutionResult( + results=[], + start_time=start, + end_time=end, + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert result.total_duration_seconds == 10.0 + + +def test_total_expectations(sample_results): + """Test total_expectations computation.""" + result = SuiteExecutionResult( + results=sample_results, + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert result.total_expectations == 4 + + +def test_total_counts(sample_results): + """Test total_passed, total_failed, and total_skipped computation.""" + result = SuiteExecutionResult( + results=sample_results, + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + 
dataframe_row_count=100, + ) + + assert result.total_passed == 2 + assert result.total_failed == 1 + assert result.total_skipped == 1 + + +@pytest.mark.parametrize( + "passed, failed, expected_rate", + [ + (3, 0, 1.0), + (2, 1, 2 / 3), + (1, 2, 1 / 3), + (0, 3, 0.0), + (0, 0, 1.0), # Edge case: no executed expectations + ], +) +def test_pass_rate(passed, failed, expected_rate): + """Test pass_rate computation with various scenarios.""" + statuses = [ExpectationStatus.PASSED] * passed + [ExpectationStatus.FAILED] * failed + results = [] + for i, status in enumerate(statuses): + results.append( + ExpectationResult( + expectation_name=f"Exp{i}", + description="Desc", + status=status, + error_message="Failed" if status == ExpectationStatus.FAILED else None, + ) + ) + + result = SuiteExecutionResult( + results=results, + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert result.pass_rate == pytest.approx(expected_rate) + + +@pytest.mark.parametrize( + "passed, failed, skipped, expected_success", + [ + (3, 0, 0, True), + (3, 0, 2, True), # Skipped doesn't affect success + (2, 1, 0, False), + (0, 0, 3, True), # All skipped = success + ], +) +def test_success(passed, failed, skipped, expected_success): + """Test success computation.""" + statuses = ( + [ExpectationStatus.PASSED] * passed + + [ExpectationStatus.FAILED] * failed + + [ExpectationStatus.SKIPPED] * skipped + ) + results = [] + for i, status in enumerate(statuses): + results.append( + ExpectationResult( + expectation_name=f"Exp{i}", + description="Desc", + status=status, + error_message="Failed" if status == ExpectationStatus.FAILED else None, + ) + ) + + result = SuiteExecutionResult( + results=results, + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert result.success == expected_success + + +def test_expectation_lists(sample_results): + """Test passed_expectations, failed_expectations, skipped_expectations.""" + result = SuiteExecutionResult( + results=sample_results, + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert len(result.passed_expectations) == 2 + assert len(result.failed_expectations) == 1 + assert len(result.skipped_expectations) == 1 + + assert all(r.status == ExpectationStatus.PASSED for r in result.passed_expectations) + assert all(r.status == ExpectationStatus.FAILED for r in result.failed_expectations) + assert all(r.status == ExpectationStatus.SKIPPED for r in result.skipped_expectations) + + +def test_serialize_pandas_violations(): + """Test serializing pandas DataFrame violations.""" + violations_df = pd.DataFrame({"col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + + count, sample = serialize_violations(violations_df, DataFrameType.PANDAS, limit=5) + + assert count == 10 + assert sample is not None + assert len(sample) == 5 + assert sample[0] == {"col1": 1} + + +def test_serialize_pyspark_violations(spark): + """Test serializing PySpark DataFrame violations.""" + violations_df = spark.createDataFrame([(i,) for i in range(1, 11)], ["col1"]) + + count, sample = serialize_violations(violations_df, DataFrameType.PYSPARK, limit=5) + + assert count == 10 + assert sample is not None + assert len(sample) == 5 + assert sample[0] == {"col1": 1} + + +def test_serialize_none_violations(): + """Test serializing None violations.""" + count, sample = serialize_violations(None, 
DataFrameType.PANDAS, limit=5) + + assert count is None + assert sample is None + + +@pytest.mark.parametrize( + "limit, expected_len", + [ + (1, 1), + (3, 3), + (10, 10), + (100, 10), # More than available + ], +) +def test_serialize_with_different_limits(limit, expected_len): + """Test serializing with different limit values.""" + violations_df = pd.DataFrame({"col1": list(range(10))}) + + count, sample = serialize_violations(violations_df, DataFrameType.PANDAS, limit=limit) + + assert count == 10 + assert sample is not None + assert len(sample) == min(expected_len, 10) + + +def test_suite_result_with_tag_filters(): + """Test SuiteExecutionResult with tag filters applied.""" + results = [ + ExpectationResult( + expectation_name="Exp1", + description="Desc1", + status=ExpectationStatus.PASSED, + tags=TagSet(["priority:high"]), + ), + ExpectationResult( + expectation_name="Exp2", + description="Desc2", + status=ExpectationStatus.SKIPPED, + tags=TagSet(["priority:low"]), + ), + ] + + result = SuiteExecutionResult( + suite_name="FilteredSuite", + applied_filters=TagSet(["priority:high"]), + tag_match_mode="any", + results=results, + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert result.total_expectations == 2 + assert result.total_passed == 1 + assert result.total_skipped == 1 + assert len(result.applied_filters) == 1 + + +@pytest.mark.parametrize( + "tag_match_mode", + ["any", "all", None], +) +def test_suite_result_tag_match_modes(tag_match_mode): + """Test SuiteExecutionResult with different tag match modes.""" + filters = TagSet(["priority:high"]) if tag_match_mode else TagSet() + + result = SuiteExecutionResult( + applied_filters=filters, + tag_match_mode=tag_match_mode, + results=[], + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) + + assert result.tag_match_mode == tag_match_mode + + +def test_suite_result_invalid_tag_match_mode(): + """Test that invalid tag_match_mode raises validation error.""" + with pytest.raises(Exception): # Pydantic raises ValidationError + SuiteExecutionResult( + applied_filters=TagSet(["priority:high"]), + tag_match_mode="invalid", # type: ignore + results=[], + start_time=datetime.now(), + end_time=datetime.now(), + dataframe_type=DataFrameType.PANDAS, + dataframe_row_count=100, + ) diff --git a/tests/base/test_suite_with_tagging.py b/tests/base/test_suite_with_tagging.py new file mode 100644 index 0000000..5c7c907 --- /dev/null +++ b/tests/base/test_suite_with_tagging.py @@ -0,0 +1,340 @@ +"""Comprehensive unit tests for suite.py with tagging and result handling.""" + +import pytest +import pandas as pd +from dataframe_expectations.suite import ( + DataFrameExpectationsSuite, + DataFrameExpectationsSuiteFailure, +) +from dataframe_expectations.core.suite_result import ( + SuiteExecutionResult, + ExpectationStatus, +) +from dataframe_expectations.core.types import DataFrameType + + +def test_build_suite_no_tags(): + """Test building suite without tag filters.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=5) + suite.expect_value_less_than(column_name="col1", value=20) + + runner = suite.build() + + assert runner.total_expectations == 2 + assert runner.selected_expectations_count == 2 + + +def test_build_suite_with_any_tag_filter(): + """Test building suite with 'any' tag filter.""" + suite = DataFrameExpectationsSuite() + 
suite.expect_value_greater_than(column_name="col1", value=5, tags=["priority:high"]) + suite.expect_value_less_than(column_name="col1", value=20, tags=["priority:medium"]) + suite.expect_min_rows(min_rows=1, tags=["priority:low"]) + + # Filter for high or medium priority + runner = suite.build(tags=["priority:high", "priority:medium"], tag_match_mode="any") + + assert runner.total_expectations == 3 + assert runner.selected_expectations_count == 2 + + +def test_build_suite_with_all_tag_filter(): + """Test building suite with 'all' tag filter.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=5, tags=["priority:high", "env:prod"]) + suite.expect_value_less_than(column_name="col1", value=20, tags=["priority:high", "env:test"]) + suite.expect_min_rows(min_rows=1, tags=["priority:low"]) + + # Filter for high priority AND prod environment + runner = suite.build(tags=["priority:high", "env:prod"], tag_match_mode="all") + + assert runner.total_expectations == 3 + assert runner.selected_expectations_count == 1 + + +def test_build_raises_error_tags_without_mode(): + """Test that building with tags but no mode raises ValueError.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=5, tags=["priority:high"]) + + with pytest.raises(ValueError, match="tag_match_mode must be specified"): + suite.build(tags=["priority:high"]) + + +def test_build_raises_error_mode_without_tags(): + """Test that building with mode but no tags raises ValueError.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=5) + + with pytest.raises(ValueError, match="tag_match_mode cannot be provided"): + suite.build(tag_match_mode="any") + + +def test_build_raises_error_all_filtered_out(): + """Test that filtering out all expectations raises ValueError.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=5, tags=["priority:low"]) + suite.expect_value_less_than(column_name="col1", value=20, tags=["priority:low"]) + + with pytest.raises(ValueError, match="resulted in zero expectations"): + suite.build(tags=["priority:high"], tag_match_mode="any") + + +@pytest.mark.parametrize( + "filter_tags, match_mode, expected_executed, expected_skipped", + [ + (None, None, 3, 0), # No filter + (["priority:high"], "any", 1, 2), # Only high + (["priority:medium"], "any", 1, 2), # Only medium + (["priority:high", "priority:medium"], "any", 2, 1), # High or medium + (["priority:high", "env:prod"], "all", 1, 2), # High AND prod + (["priority:high", "env:test"], "all", 0, 3), # High AND test (none match) + ], +) +def test_runner_tag_filtering(filter_tags, match_mode, expected_executed, expected_skipped): + """Test runner with various tag filtering configurations.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=0, tags=["priority:high", "env:prod"]) + suite.expect_value_less_than( + column_name="col1", value=100, tags=["priority:medium", "env:prod"] + ) + suite.expect_min_rows(min_rows=1, tags=["priority:low"]) + + df = pd.DataFrame({"col1": [10, 20, 30]}) + + if filter_tags is None: + runner = suite.build() + else: + try: + runner = suite.build(tags=filter_tags, tag_match_mode=match_mode) + except ValueError: + # All filtered out + assert expected_executed == 0 + return + + result = runner.run(data_frame=df, raise_on_failure=False) + + assert result is not None + assert result.total_passed == 
expected_executed + assert result.total_skipped == expected_skipped + + +def test_runner_lists_all_vs_selected_expectations(): + """Test list_all_expectations vs list_selected_expectations.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=0, tags=["priority:high"]) + suite.expect_value_less_than(column_name="col1", value=100, tags=["priority:medium"]) + suite.expect_min_rows(min_rows=1, tags=["priority:low"]) + + runner = suite.build(tags=["priority:high", "priority:medium"], tag_match_mode="any") + + all_expectations = runner.list_all_expectations() + selected_expectations = runner.list_selected_expectations() + + assert len(all_expectations) == 3 + assert len(selected_expectations) == 2 + + +@pytest.mark.parametrize( + "passing, failing, raise_on_failure, should_raise", + [ + (3, 0, True, False), + (3, 0, False, False), + (2, 1, True, True), + (2, 1, False, False), + (0, 3, True, True), + (0, 3, False, False), + ], +) +def test_raise_on_failure_parametrized(passing, failing, raise_on_failure, should_raise): + """Test raise_on_failure with various passing/failing combinations.""" + suite = DataFrameExpectationsSuite() + + # Add passing expectations + for i in range(passing): + suite.expect_value_greater_than(column_name="col1", value=0) + + # Add failing expectations + for i in range(failing): + suite.expect_value_greater_than(column_name="col1", value=100) + + df = pd.DataFrame({"col1": [10, 20, 30]}) + runner = suite.build() + + if should_raise: + with pytest.raises(DataFrameExpectationsSuiteFailure) as exc_info: + runner.run(data_frame=df, raise_on_failure=raise_on_failure) + + exception = exc_info.value + assert len(exception.failures) == failing + assert exception.result is not None + assert isinstance(exception.result, SuiteExecutionResult) + assert exception.result.total_passed == passing + assert exception.result.total_failed == failing + else: + result = runner.run(data_frame=df, raise_on_failure=raise_on_failure) + assert result is not None + assert isinstance(result, SuiteExecutionResult) + assert result.total_passed == passing + assert result.total_failed == failing + + +def test_result_contains_execution_metadata(): + """Test that result contains execution metadata.""" + suite = DataFrameExpectationsSuite(suite_name="TestSuite") + suite.expect_min_rows(min_rows=1) + + df = pd.DataFrame({"col1": [1, 2, 3]}) + runner = suite.build() + + result = runner.run(data_frame=df, context={"job_id": "123"}) + + assert result is not None + assert isinstance(result, SuiteExecutionResult) + assert result.suite_name == "TestSuite" + assert result.context == {"job_id": "123"} + assert result.dataframe_type == DataFrameType.PANDAS + assert result.dataframe_row_count == 3 + assert result.total_duration_seconds > 0 + + +def test_result_contains_expectation_details(): + """Test that result contains detailed expectation results.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=0, tags=["priority:high"]) + suite.expect_value_greater_than(column_name="col1", value=100, tags=["priority:low"]) + + df = pd.DataFrame({"col1": [10, 20, 30]}) + runner = suite.build() + + result = runner.run(data_frame=df, raise_on_failure=False) + + assert result is not None + assert isinstance(result, SuiteExecutionResult) + assert len(result.results) == 2 + + # Check passed expectation + passed = result.passed_expectations[0] + assert passed.expectation_name == "ExpectationValueGreaterThan" + assert passed.status == 
ExpectationStatus.PASSED + assert passed.tags is not None + assert len(passed.tags) == 1 + + # Check failed expectation + failed = result.failed_expectations[0] + assert failed.expectation_name == "ExpectationValueGreaterThan" + assert failed.status == ExpectationStatus.FAILED + assert failed.error_message is not None + assert failed.violation_count is not None + + +def test_result_with_skipped_expectations(): + """Test that result includes skipped expectations.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=0, tags=["priority:high"]) + suite.expect_value_less_than(column_name="col1", value=100, tags=["priority:low"]) + + df = pd.DataFrame({"col1": [10, 20, 30]}) + runner = suite.build(tags=["priority:high"], tag_match_mode="any") + + result = runner.run(data_frame=df) + + assert result is not None + assert isinstance(result, SuiteExecutionResult) + assert result.total_expectations == 2 + assert result.total_passed == 1 + assert result.total_skipped == 1 + + skipped = result.skipped_expectations[0] + assert skipped.status == ExpectationStatus.SKIPPED + assert skipped.expectation_name == "ExpectationValueLessThan" + + +def test_result_applied_filters(): + """Test that result captures applied tag filters.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=0, tags=["priority:high"]) + + df = pd.DataFrame({"col1": [10]}) + runner = suite.build(tags=["priority:high"], tag_match_mode="any") + + result = runner.run(data_frame=df) + + assert result is not None + assert len(result.applied_filters) == 1 + assert result.tag_match_mode == "any" + + +def test_pyspark_caching_enabled(spark): + """Test that PySpark DataFrame is cached during execution.""" + suite = DataFrameExpectationsSuite() + suite.expect_min_rows(min_rows=1) + + df = spark.createDataFrame([(1,), (2,), (3,)], ["col1"]) + runner = suite.build() + + # DataFrame should not be cached initially + assert not df.is_cached + + result = runner.run(data_frame=df) + + # Should record that it wasn't cached before + assert result is not None + assert result.dataframe_was_cached is False + assert result.dataframe_type == DataFrameType.PYSPARK + + # DataFrame should be uncached after execution + assert not df.is_cached + + +def test_pyspark_already_cached(spark): + """Test behavior when PySpark DataFrame is already cached.""" + suite = DataFrameExpectationsSuite() + suite.expect_min_rows(min_rows=1) + + df = spark.createDataFrame([(1,), (2,), (3,)], ["col1"]) + df.cache() # Pre-cache + + runner = suite.build() + result = runner.run(data_frame=df) + + assert result is not None + assert result.dataframe_was_cached is True + assert df.is_cached # Should still be cached + + df.unpersist() + + +def test_exception_contains_failures(): + """Test that exception contains failure details.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", value=100) + suite.expect_value_less_than(column_name="col1", value=5) + + df = pd.DataFrame({"col1": [10, 20, 30]}) + runner = suite.build() + + with pytest.raises(DataFrameExpectationsSuiteFailure) as exc_info: + runner.run(data_frame=df) + + exception = exc_info.value + assert exception.total_expectations == 2 + assert len(exception.failures) == 2 + assert exception.result is not None + + +def test_exception_string_representation(): + """Test exception string representation.""" + suite = DataFrameExpectationsSuite() + suite.expect_value_greater_than(column_name="col1", 
value=100) + + df = pd.DataFrame({"col1": [10, 20, 30]}) + runner = suite.build() + + with pytest.raises(DataFrameExpectationsSuiteFailure) as exc_info: + runner.run(data_frame=df) + + exception_str = str(exc_info.value) + assert "(1/1) expectations failed" in exception_str + assert "List of violations:" in exception_str diff --git a/tests/base/test_tagging.py b/tests/base/test_tagging.py new file mode 100644 index 0000000..e130b1f --- /dev/null +++ b/tests/base/test_tagging.py @@ -0,0 +1,258 @@ +"""Comprehensive unit tests for TagSet tagging system.""" + +import pytest +from dataframe_expectations.core.tagging import TagSet + + +@pytest.mark.parametrize( + "tags, expected_count", + [ + (["priority:high"], 1), + (["priority:high", "env:test"], 2), + (["priority:high", "priority:medium"], 2), + (["priority:high", "env:test", "env:prod"], 3), + ([], 0), + (None, 0), + ], +) +def test_initialization_valid_tags(tags, expected_count): + """Test initialization with valid tags.""" + tag_set = TagSet(tags) + assert len(tag_set) == expected_count + + +@pytest.mark.parametrize( + "invalid_tag, error_pattern", + [ + ("notag", "Invalid tag format"), + ("no-colon", "Invalid tag format"), + (":no-key", "key and value must be non-empty"), + ("no-value:", "key and value must be non-empty"), + (" : ", "key and value must be non-empty"), + ], +) +def test_initialization_invalid_format(invalid_tag, error_pattern): + """Test that invalid tag formats raise ValueError.""" + with pytest.raises(ValueError, match=error_pattern): + TagSet([invalid_tag]) + + +def test_multiple_colons_allowed(): + """Test that multiple colons are allowed (split on first colon).""" + tag_set = TagSet(["url:https://example.com"]) + assert len(tag_set) == 1 + assert tag_set.tags["url"] == {"https://example.com"} + + +def test_multiple_values_same_key(): + """Test that multiple values for the same key are stored correctly.""" + tag_set = TagSet(["priority:high", "priority:medium", "priority:low"]) + assert len(tag_set) == 3 + # Verify internal structure + assert "priority" in tag_set.tags + assert tag_set.tags["priority"] == {"high", "medium", "low"} + + +def test_whitespace_handling(): + """Test that whitespace is properly handled.""" + tag_set = TagSet([" priority : high ", "env:test"]) + assert len(tag_set) == 2 + + +@pytest.mark.parametrize( + "self_tags, other_tags, expected", + [ + # Single key matching + (["priority:high"], ["priority:high"], True), + (["priority:high"], ["priority:medium"], False), + # Multiple keys, any matches + (["priority:high", "env:test"], ["priority:high"], True), + (["priority:high", "env:test"], ["env:test"], True), + (["priority:high", "env:test"], ["priority:high", "env:test"], True), + # Multiple keys, none match + (["priority:high", "env:test"], ["priority:low", "env:prod"], False), + # Multiple values per key + (["priority:high", "priority:medium"], ["priority:high"], True), + (["priority:high", "priority:medium"], ["priority:low"], False), + (["priority:high", "priority:medium"], ["priority:medium", "priority:low"], True), + # Empty cases + (["priority:high"], [], True), # Empty other matches everything + ([], ["priority:high"], False), # Empty self matches nothing + # Complex scenarios + ( + ["priority:high", "priority:medium", "env:test", "role:admin"], + ["priority:low", "env:test"], + True, + ), # env:test matches + ( + ["priority:high", "env:test"], + ["priority:medium", "env:prod", "role:admin"], + False, + ), + ], +) +def test_has_any_tag_from(self_tags, other_tags, expected): + """Test 
has_any_tag_from with various combinations (OR logic).""" + tag_set = TagSet(self_tags) + other = TagSet(other_tags) + assert tag_set.has_any_tag_from(other) == expected + + +@pytest.mark.parametrize( + "self_tags, other_tags, expected", + [ + # Single key matching + (["priority:high"], ["priority:high"], True), + (["priority:high"], ["priority:medium"], False), + # Multiple keys, all must match + (["priority:high", "env:test"], ["priority:high"], True), # All of other present + (["priority:high", "env:test"], ["env:test"], True), # All of other present + ( + ["priority:high", "env:test"], + ["priority:high", "env:test"], + True, + ), # All of other present + ( + ["priority:high", "env:test"], + ["priority:high", "env:prod"], + False, + ), # env:prod not present + # Multiple values per key - ALL must be present + (["priority:high", "priority:medium"], ["priority:high"], True), + ( + ["priority:high", "priority:medium"], + ["priority:high", "priority:medium"], + True, + ), + (["priority:high"], ["priority:high", "priority:medium"], False), # medium not present + # Empty cases + (["priority:high"], [], True), # Empty other matches everything + ([], ["priority:high"], False), # Empty self matches nothing + # Complex scenarios + ( + ["priority:high", "priority:medium", "env:test", "role:admin"], + ["priority:high", "env:test"], + True, + ), + ( + ["priority:high", "priority:medium", "env:test", "role:admin"], + ["priority:high", "priority:medium", "env:test"], + True, + ), + ( + ["priority:high", "env:test"], + ["priority:high", "priority:medium", "env:test"], + False, + ), # priority:medium not present + ( + ["priority:high", "env:test"], + ["priority:high", "env:test", "role:admin"], + False, + ), # role:admin not present + ], +) +def test_has_all_tags_from(self_tags, other_tags, expected): + """Test has_all_tags_from with various combinations (AND logic).""" + tag_set = TagSet(self_tags) + other = TagSet(other_tags) + assert tag_set.has_all_tags_from(other) == expected + + +@pytest.mark.parametrize( + "tags, expected_empty", + [ + ([], True), + (None, True), + (["priority:high"], False), + (["priority:high", "env:test"], False), + ], +) +def test_is_empty(tags, expected_empty): + """Test is_empty method.""" + tag_set = TagSet(tags) + assert tag_set.is_empty() == expected_empty + + +@pytest.mark.parametrize( + "tags, expected_len", + [ + ([], 0), + (None, 0), + (["priority:high"], 1), + (["priority:high", "env:test"], 2), + (["priority:high", "priority:medium"], 2), + (["priority:high", "priority:medium", "env:test"], 3), + ], +) +def test_len(tags, expected_len): + """Test __len__ method.""" + tag_set = TagSet(tags) + assert len(tag_set) == expected_len + + +@pytest.mark.parametrize( + "tags, expected_bool", + [ + ([], False), + (None, False), + (["priority:high"], True), + (["priority:high", "env:test"], True), + ], +) +def test_bool(tags, expected_bool): + """Test __bool__ method.""" + tag_set = TagSet(tags) + assert bool(tag_set) == expected_bool + + +@pytest.mark.parametrize( + "tags, expected_str", + [ + ([], "TagSet(empty)"), + (None, "TagSet(empty)"), + (["priority:high"], "TagSet(priority:high)"), + (["env:test", "priority:high"], "TagSet(env:test, priority:high)"), # Alphabetically sorted + ( + ["priority:medium", "priority:high"], + "TagSet(priority:high, priority:medium)", + ), # Values sorted + ], +) +def test_str_representation(tags, expected_str): + """Test __str__ and __repr__ methods.""" + tag_set = TagSet(tags) + assert str(tag_set) == expected_str + assert repr(tag_set) == 
expected_str + + +def test_duplicate_tags(): + """Test that duplicate tags are deduplicated.""" + tag_set = TagSet(["priority:high", "priority:high", "env:test", "env:test"]) + assert len(tag_set) == 2 # Only unique tags counted + + +def test_case_sensitivity(): + """Test that tags are case-sensitive.""" + tag_set = TagSet(["priority:high", "priority:High", "Priority:high"]) + assert len(tag_set) == 3 # All three are different + + +def test_special_characters_in_values(): + """Test tags with special characters in values.""" + tag_set = TagSet(["url:https://example.com", "path:/usr/local/bin", "label:user-name"]) + assert len(tag_set) == 3 + + +def test_numeric_values(): + """Test tags with numeric-looking values.""" + tag_set = TagSet(["version:1.0", "port:8080", "priority:1"]) + assert len(tag_set) == 3 + + +def test_empty_string_handling(): + """Test that empty strings in key or value raise errors.""" + with pytest.raises(ValueError, match="key and value must be non-empty"): + TagSet([":value"]) + + with pytest.raises(ValueError, match="key and value must be non-empty"): + TagSet(["key:"]) diff --git a/tests/core/test_expectations.py b/tests/core/test_expectations.py index b73d528..613cb3f 100644 --- a/tests/core/test_expectations.py +++ b/tests/core/test_expectations.py @@ -3,8 +3,59 @@ import pandas as pd +from dataframe_expectations.core.column_expectation import DataFrameColumnExpectation from dataframe_expectations.core.types import DataFrameLike, DataFrameType from dataframe_expectations.core.expectation import DataFrameExpectation +from dataframe_expectations.core.tagging import TagSet +from dataframe_expectations.expectations.column.numerical import ( + create_expectation_value_greater_than, + create_expectation_value_less_than, + create_expectation_value_between, +) +from dataframe_expectations.expectations.column.any_value import ( + create_expectation_value_equals, + create_expectation_value_not_equals, + create_expectation_value_null, + create_expectation_value_not_null, + create_expectation_value_in, + create_expectation_value_not_in, +) +from dataframe_expectations.expectations.column.string import ( + create_expectation_string_contains, + create_expectation_string_length_equals, + create_expectation_string_length_between, +) +from dataframe_expectations.expectations.aggregation.any_value import ( + ExpectationMinRows, + ExpectationMaxRows, + ExpectationMaxNullPercentage, + ExpectationMaxNullCount, + create_expectation_max_null_count, + create_expectation_max_null_percentage, + create_expectation_max_rows, + create_expectation_min_rows, +) +from dataframe_expectations.expectations.aggregation.numerical import ( + ExpectationColumnQuantileBetween, + ExpectationColumnMeanBetween, + create_expectation_column_max_to_be_between, + create_expectation_column_median_to_be_between, + create_expectation_column_min_to_be_between, + create_expectation_column_quantile_between, + create_expectation_column_mean_to_be_between, +) +from dataframe_expectations.expectations.aggregation.unique import ( + ExpectationUniqueRows, + ExpectationDistinctColumnValuesEquals, + ExpectationDistinctColumnValuesLessThan, + ExpectationDistinctColumnValuesGreaterThan, + ExpectationDistinctColumnValuesBetween, + create_expectation_distinct_column_values_equals, + create_expectation_unique, + create_expectation_distinct_column_values_less_than, + create_expectation_distinct_column_values_greater_than, + create_expectation_distinct_column_values_between, +) class MyTestExpectation(DataFrameExpectation): @@ 
-300,3 +351,162 @@ def test_infer_data_frame_type_connect_import_behavior(spark): assert result_type == DataFrameType.PYSPARK, ( f"Expected PYSPARK type for Connect DataFrame but got: {result_type}" ) + + +@pytest.mark.parametrize( + "tag_list, expected_count, expected_empty", + [ + (None, 0, True), + ([], 0, True), + (["priority:high"], 1, False), + (["priority:high", "env:prod"], 2, False), + (["priority:high", "env:prod", "team:data", "critical:true"], 4, False), + (["priority:high", "priority:high", "env:prod"], 2, False), # Deduplication + ], +) +def test_tags_initialization(tag_list, expected_count, expected_empty): + """Test that DataFrameExpectation properly initializes tags.""" + expectation = MyTestExpectation(tags=tag_list) + tags = expectation.get_tags() + + assert isinstance(tags, TagSet) + assert len(tags) == expected_count + assert tags.is_empty() == expected_empty + + +def test_tags_propagation_to_subclass(): + """Test that tags are properly propagated to subclasses.""" + + class MySubclassExpectation(MyTestExpectation): + def __init__(self, custom_param: str, tags=None): + super().__init__(tags=tags) + self.custom_param = custom_param + + expectation_no_tags = MySubclassExpectation(custom_param="test") + assert expectation_no_tags.get_tags().is_empty() + + expectation_with_tags = MySubclassExpectation(custom_param="test", tags=["priority:high"]) + assert len(expectation_with_tags.get_tags()) == 1 + + +def test_tags_immutability(): + """Test that get_tags() returns the same TagSet instance.""" + expectation = MyTestExpectation(tags=["priority:high"]) + tags1 = expectation.get_tags() + tags2 = expectation.get_tags() + + assert isinstance(tags1, TagSet) + assert tags1 is tags2 + + +def test_tags_with_invalid_format(): + """Test that invalid tag formats raise appropriate errors.""" + with pytest.raises(ValueError, match="Invalid tag format"): + MyTestExpectation(tags=["invalid-tag-no-colon"]) + + +@pytest.mark.parametrize( + "factory_fn, kwargs", + [ + ( + DataFrameColumnExpectation, + { + "expectation_name": "TestColumn", + "column_name": "test", + "fn_violations_pandas": lambda df: df, + "fn_violations_pyspark": lambda df: df, + "description": "Test", + "error_message": "Error", + }, + ), + # Column expectations (factory-created) + (create_expectation_value_greater_than, {"column_name": "col", "value": 10}), + (create_expectation_value_less_than, {"column_name": "col", "value": 10}), + (create_expectation_value_between, {"column_name": "col", "min_value": 1, "max_value": 10}), + (create_expectation_value_equals, {"column_name": "col", "value": 10}), + (create_expectation_value_not_equals, {"column_name": "col", "value": 10}), + (create_expectation_value_null, {"column_name": "col"}), + (create_expectation_value_not_null, {"column_name": "col"}), + (create_expectation_value_in, {"column_name": "col", "values": [1, 2, 3]}), + (create_expectation_value_not_in, {"column_name": "col", "values": [1, 2, 3]}), + (create_expectation_string_contains, {"column_name": "col", "substring": "test"}), + (create_expectation_string_length_equals, {"column_name": "col", "length": 5}), + ( + create_expectation_string_length_between, + {"column_name": "col", "min_length": 1, "max_length": 10}, + ), + # Aggregation expectations (class-based) + (ExpectationMinRows, {"min_rows": 10}), + (ExpectationMaxRows, {"max_rows": 100}), + (ExpectationMaxNullPercentage, {"column_name": "col", "max_percentage": 0.1}), + (ExpectationMaxNullCount, {"column_name": "col", "max_count": 10}), + ( + 
+            ExpectationColumnQuantileBetween,
+            {"column_name": "col", "quantile": 0.5, "min_value": 1, "max_value": 10},
+        ),
+        (ExpectationColumnMeanBetween, {"column_name": "col", "min_value": 1, "max_value": 10}),
+        (ExpectationUniqueRows, {"column_names": ["col1", "col2"]}),
+        (ExpectationDistinctColumnValuesEquals, {"column_name": "col", "expected_value": 10}),
+        (ExpectationDistinctColumnValuesLessThan, {"column_name": "col", "threshold": 10}),
+        (ExpectationDistinctColumnValuesGreaterThan, {"column_name": "col", "threshold": 10}),
+        (
+            ExpectationDistinctColumnValuesBetween,
+            {"column_name": "col", "min_value": 1, "max_value": 10},
+        ),
+        # Aggregation expectations (factory-created)
+        (create_expectation_min_rows, {"min_rows": 10}),
+        (create_expectation_max_rows, {"max_rows": 100}),
+        (create_expectation_max_null_percentage, {"column_name": "col", "max_percentage": 0.1}),
+        (create_expectation_max_null_count, {"column_name": "col", "max_count": 10}),
+        (
+            create_expectation_column_quantile_between,
+            {"column_name": "col", "quantile": 0.5, "min_value": 1, "max_value": 10},
+        ),
+        (
+            create_expectation_column_max_to_be_between,
+            {"column_name": "col", "min_value": 1, "max_value": 10},
+        ),
+        (
+            create_expectation_column_min_to_be_between,
+            {"column_name": "col", "min_value": 1, "max_value": 10},
+        ),
+        (
+            create_expectation_column_mean_to_be_between,
+            {"column_name": "col", "min_value": 1, "max_value": 10},
+        ),
+        (
+            create_expectation_column_median_to_be_between,
+            {"column_name": "col", "min_value": 1, "max_value": 10},
+        ),
+        (create_expectation_unique, {"column_names": ["col1", "col2"]}),
+        (
+            create_expectation_distinct_column_values_equals,
+            {"column_name": "col", "expected_value": 10},
+        ),
+        (
+            create_expectation_distinct_column_values_less_than,
+            {"column_name": "col", "threshold": 10},
+        ),
+        (
+            create_expectation_distinct_column_values_greater_than,
+            {"column_name": "col", "threshold": 10},
+        ),
+        (
+            create_expectation_distinct_column_values_between,
+            {"column_name": "col", "min_value": 1, "max_value": 10},
+        ),
+    ],
+)
+def test_tags_sent_to_base_class(factory_fn, kwargs):
+    """
+    Test that tags are properly propagated to all expectation classes.
+
+    Covers all direct and indirect children of DataFrameExpectation:
+    - DataFrameColumnExpectation (direct child)
+    - DataFrameAggregationExpectation (direct child)
+    - All factory-created column expectations
+    - All class-based aggregation expectations
+    """
+    test_tags = ["priority:high", "env:test"]
+    expectation = factory_fn(**kwargs, tags=test_tags)
+    assert len(expectation.get_tags()) == 2
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_between.py b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_between.py
index df12a3f..7393b45 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_between.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_between.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -276,7 +277,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_equals.py b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_equals.py
index 684a6e5..148b3f4 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_equals.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_equals.py
@@ -9,6 +9,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -294,7 +295,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_greater_than.py b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_greater_than.py
index f22fb7a..ecb86a6 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_greater_than.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_greater_than.py
@@ -9,6 +9,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -307,7 +308,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_less_than.py b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_less_than.py
index 3fafdff..3e74d45 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_less_than.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_distinct_column_values_less_than.py
@@ -9,6 +9,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -336,7 +337,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_count.py b/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_count.py
index 11bf923..98f461e 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_count.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_count.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -229,7 +230,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_percentage.py b/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_percentage.py
index 17f02d8..b874ce7 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_percentage.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_max_null_percentage.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -265,7 +266,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_max_rows.py b/tests/expectations/aggregation/any_value_expectations/test_expect_max_rows.py
index aa82067..4260302 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_max_rows.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_max_rows.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -273,7 +274,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_min_rows.py b/tests/expectations/aggregation/any_value_expectations/test_expect_min_rows.py
index 48e8b73..8d63110 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_min_rows.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_min_rows.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -333,7 +334,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/any_value_expectations/test_expect_unique_rows.py b/tests/expectations/aggregation/any_value_expectations/test_expect_unique_rows.py
index 2b8e795..d371536 100644
--- a/tests/expectations/aggregation/any_value_expectations/test_expect_unique_rows.py
+++ b/tests/expectations/aggregation/any_value_expectations/test_expect_unique_rows.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -272,7 +273,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/numerical_expectations/test_expect_column_max_between.py b/tests/expectations/aggregation/numerical_expectations/test_expect_column_max_between.py
index 67f856d..35d7976 100644
--- a/tests/expectations/aggregation/numerical_expectations/test_expect_column_max_between.py
+++ b/tests/expectations/aggregation/numerical_expectations/test_expect_column_max_between.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -208,7 +209,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/numerical_expectations/test_expect_column_mean_between.py b/tests/expectations/aggregation/numerical_expectations/test_expect_column_mean_between.py
index 48c12ad..7fbecb0 100644
--- a/tests/expectations/aggregation/numerical_expectations/test_expect_column_mean_between.py
+++ b/tests/expectations/aggregation/numerical_expectations/test_expect_column_mean_between.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -242,7 +243,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/aggregation/numerical_expectations/test_expect_column_median_between.py b/tests/expectations/aggregation/numerical_expectations/test_expect_column_median_between.py
index 2a332d7..54d5ea7 100644
--- a/tests/expectations/aggregation/numerical_expectations/test_expect_column_median_between.py
+++ b/tests/expectations/aggregation/numerical_expectations/test_expect_column_median_between.py
@@ -297,7 +297,8 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert suite_result.success, "Expected all expectations to pass"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/aggregation/numerical_expectations/test_expect_column_min_between.py b/tests/expectations/aggregation/numerical_expectations/test_expect_column_min_between.py
index e241004..58230bc 100644
--- a/tests/expectations/aggregation/numerical_expectations/test_expect_column_min_between.py
+++ b/tests/expectations/aggregation/numerical_expectations/test_expect_column_min_between.py
@@ -277,7 +277,8 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert suite_result.success, "Expected all expectations to pass"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/aggregation/numerical_expectations/test_expect_column_quantile_between.py b/tests/expectations/aggregation/numerical_expectations/test_expect_column_quantile_between.py
index 8742bdd..bfe9bb8 100644
--- a/tests/expectations/aggregation/numerical_expectations/test_expect_column_quantile_between.py
+++ b/tests/expectations/aggregation/numerical_expectations/test_expect_column_quantile_between.py
@@ -260,7 +260,8 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert suite_result.success, "Expected all expectations to pass"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/column/any_value_expectations/test_expect_value_equals.py b/tests/expectations/column/any_value_expectations/test_expect_value_equals.py
index e9d35f1..32418ed 100644
--- a/tests/expectations/column/any_value_expectations/test_expect_value_equals.py
+++ b/tests/expectations/column/any_value_expectations/test_expect_value_equals.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -430,7 +431,13 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/column/any_value_expectations/test_expect_value_in.py b/tests/expectations/column/any_value_expectations/test_expect_value_in.py
index 66a0ef8..f40c4ff 100644
--- a/tests/expectations/column/any_value_expectations/test_expect_value_in.py
+++ b/tests/expectations/column/any_value_expectations/test_expect_value_in.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -385,7 +386,13 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/column/any_value_expectations/test_expect_value_not_equals.py b/tests/expectations/column/any_value_expectations/test_expect_value_not_equals.py
index 443ec3a..af15e10 100644
--- a/tests/expectations/column/any_value_expectations/test_expect_value_not_equals.py
+++ b/tests/expectations/column/any_value_expectations/test_expect_value_not_equals.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -524,7 +525,13 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/column/any_value_expectations/test_expect_value_not_in.py b/tests/expectations/column/any_value_expectations/test_expect_value_not_in.py
index 2b5e1e8..8a4a618 100644
--- a/tests/expectations/column/any_value_expectations/test_expect_value_not_in.py
+++ b/tests/expectations/column/any_value_expectations/test_expect_value_not_in.py
@@ -8,6 +8,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -567,7 +568,13 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/column/any_value_expectations/test_expect_value_not_null.py b/tests/expectations/column/any_value_expectations/test_expect_value_not_null.py
index a11a52d..028dd6a 100644
--- a/tests/expectations/column/any_value_expectations/test_expect_value_not_null.py
+++ b/tests/expectations/column/any_value_expectations/test_expect_value_not_null.py
@@ -9,6 +9,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -457,7 +458,13 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/column/any_value_expectations/test_expect_value_null.py b/tests/expectations/column/any_value_expectations/test_expect_value_null.py
index 69be833..20651d3 100644
--- a/tests/expectations/column/any_value_expectations/test_expect_value_null.py
+++ b/tests/expectations/column/any_value_expectations/test_expect_value_null.py
@@ -9,6 +9,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -566,7 +567,13 @@ def test_expectation_basic_scenarios(
     if should_succeed:
         suite_result = suite.build().run(data_frame=df)
-        assert suite_result is None, f"Suite test expected None but got: {suite_result}"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             suite.build().run(data_frame=df)
diff --git a/tests/expectations/column/numerical_expectations/test_expect_value_between.py b/tests/expectations/column/numerical_expectations/test_expect_value_between.py
index 78ade63..399a8da 100644
--- a/tests/expectations/column/numerical_expectations/test_expect_value_between.py
+++ b/tests/expectations/column/numerical_expectations/test_expect_value_between.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -465,7 +466,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/numerical_expectations/test_expect_value_greater_than.py b/tests/expectations/column/numerical_expectations/test_expect_value_greater_than.py
index 858fe36..1153ac4 100644
--- a/tests/expectations/column/numerical_expectations/test_expect_value_greater_than.py
+++ b/tests/expectations/column/numerical_expectations/test_expect_value_greater_than.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -576,7 +577,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/numerical_expectations/test_expect_value_less_than.py b/tests/expectations/column/numerical_expectations/test_expect_value_less_than.py
index a496a80..f80bb08 100644
--- a/tests/expectations/column/numerical_expectations/test_expect_value_less_than.py
+++ b/tests/expectations/column/numerical_expectations/test_expect_value_less_than.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -576,7 +577,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/string_expectations/test_expect_string_contains.py b/tests/expectations/column/string_expectations/test_expect_string_contains.py
index bc8cfe4..641df87 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_contains.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_contains.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -458,7 +459,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/string_expectations/test_expect_string_ends_with.py b/tests/expectations/column/string_expectations/test_expect_string_ends_with.py
index 9886c26..42e9b9e 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_ends_with.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_ends_with.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -480,7 +481,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/string_expectations/test_expect_string_length_between.py b/tests/expectations/column/string_expectations/test_expect_string_length_between.py
index 3dc8197..4b68256 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_length_between.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_length_between.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -476,7 +477,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/string_expectations/test_expect_string_length_equals.py b/tests/expectations/column/string_expectations/test_expect_string_length_equals.py
index 691fb58..9149142 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_length_equals.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_length_equals.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -444,7 +445,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/string_expectations/test_expect_string_length_greater_than.py b/tests/expectations/column/string_expectations/test_expect_string_length_greater_than.py
index 2e5c7ac..3fa3d0b 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_length_greater_than.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_length_greater_than.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -439,7 +440,11 @@ def test_expectation_basic_scenarios(
     if expected_result == "success":
         result = expectations_suite.build().run(data_frame=data_frame)
-        assert result is None, "Expected no exceptions to be raised from suite"
+        assert result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(result, SuiteExecutionResult), "Result should be SuiteExecutionResult"
+        assert result.success, "Expected all expectations to pass"
+        assert result.total_passed == 1, "Expected 1 passed expectation"
+        assert result.total_failed == 0, "Expected 0 failed expectations"
     else:  # failure
         with pytest.raises(DataFrameExpectationsSuiteFailure):
             expectations_suite.build().run(data_frame=data_frame)
diff --git a/tests/expectations/column/string_expectations/test_expect_string_length_less_than.py b/tests/expectations/column/string_expectations/test_expect_string_length_less_than.py
index ee8b179..7cca514 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_length_less_than.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_length_less_than.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -399,7 +400,13 @@ def test_expectation_basic_scenarios(
             column_name="col1", length=length
         )
         suite_result = expectations_suite.build().run(data_frame=data_frame)
-        assert suite_result is None, "Expected no exceptions to be raised"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:  # violations
         violations_df = create_dataframe(df_type, expected_violations, "col1", spark)
         expected_message = f"Found {len(expected_violations)} row(s) where 'col1' length is not less than {length}."
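
Every `test_expectation_basic_scenarios` hunk above applies the same mechanical change: `run()` no longer returns `None` on success, it returns a `SuiteExecutionResult`. Condensed outside the diff, the success-path contract these tests now assert looks roughly like the sketch below; the suite, column name, and data are illustrative placeholders, not taken from any single touched test file.

```python
# Success-path contract the updated tests assert against (illustrative data).
import pandas as pd

from dataframe_expectations.suite import DataFrameExpectationsSuite, SuiteExecutionResult

df = pd.DataFrame({"col1": [1, 2, 3]})
suite = DataFrameExpectationsSuite().expect_value_greater_than(column_name="col1", value=0)

result = suite.build().run(data_frame=df)
assert isinstance(result, SuiteExecutionResult)  # no longer None on success
assert result.success
assert result.total_passed == 1 and result.total_failed == 0
```

Failing suites still raise `DataFrameExpectationsSuiteFailure`, which is why the `else` branches of these tests are unchanged.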
diff --git a/tests/expectations/column/string_expectations/test_expect_string_not_contains.py b/tests/expectations/column/string_expectations/test_expect_string_not_contains.py
index 0996424..6350736 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_not_contains.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_not_contains.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -513,7 +514,13 @@ def test_expectation_basic_scenarios(
             column_name="col1", substring=substring
         )
         suite_result = expectations_suite.build().run(data_frame=data_frame)
-        assert suite_result is None, "Expected no exceptions to be raised"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:  # violations
         violations_df = create_dataframe(df_type, expected_violations, "col1", spark)
         expected_message = (
diff --git a/tests/expectations/column/string_expectations/test_expect_string_starts_with.py b/tests/expectations/column/string_expectations/test_expect_string_starts_with.py
index 9a2c577..3ac997b 100644
--- a/tests/expectations/column/string_expectations/test_expect_string_starts_with.py
+++ b/tests/expectations/column/string_expectations/test_expect_string_starts_with.py
@@ -7,6 +7,7 @@
 from dataframe_expectations.suite import (
     DataFrameExpectationsSuite,
     DataFrameExpectationsSuiteFailure,
+    SuiteExecutionResult,
 )
 from dataframe_expectations.result_message import (
     DataFrameExpectationFailureMessage,
@@ -303,7 +304,13 @@ def test_expectation_basic_scenarios(
             column_name="col1", prefix=prefix
         )
        suite_result = expectations_suite.build().run(data_frame=data_frame)
-        assert suite_result is None, "Expected no exceptions to be raised"
+        assert suite_result is not None, "Expected SuiteExecutionResult"
+        assert isinstance(suite_result, SuiteExecutionResult), (
+            "Result should be SuiteExecutionResult"
+        )
+        assert suite_result.success, "Expected all expectations to pass"
+        assert suite_result.total_passed == 1, "Expected 1 passed expectation"
+        assert suite_result.total_failed == 0, "Expected 0 failed expectations"
     else:  # violations
         violations_df = create_dataframe(df_type, expected_violations, "col1", spark)
         expected_message = (
diff --git a/uv.lock b/uv.lock
index 1f8851f..2a753d6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -323,6 +323,7 @@ dev = [
     { name = "pre-commit" },
     { name = "pytest" },
     { name = "pytest-cov" },
+    { name = "pytest-xdist" },
     { name = "ruff" },
 ]
 docs = [
@@ -349,6 +350,7 @@ dev = [
     { name = "pre-commit", specifier = ">=2.20.0" },
     { name = "pytest", specifier = ">=7.0.0" },
     { name = "pytest-cov", specifier = ">=4.0.0" },
+    { name = "pytest-xdist", specifier = ">=3.0.0" },
     { name = "ruff", specifier = ">=0.1.0" },
 ]
 docs = [
@@ -391,6 +393,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },
 ]

+[[package]]
+name = "execnet"
+version = "2.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" },
+]
+
 [[package]]
 name = "filelock"
 version = "3.20.0"
@@ -927,6 +938,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" },
 ]

+[[package]]
+name = "pytest-xdist"
+version = "3.8.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "execnet" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
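
For reference, the `TagSet` behaviour exercised by the new tagging tests at the top of this patch (deduplication, case sensitivity, `key:value` parsing, and the `Invalid tag format` / `key and value must be non-empty` errors) can be approximated by a minimal sketch. This is only an illustration of the tested contract, assumed from the tests themselves; it is not the actual `dataframe_expectations.core.tagging.TagSet` implementation, and the class name and internals below are placeholders.

```python
# Illustrative sketch of the contract the TagSet tests encode; not the real
# dataframe_expectations.core.tagging.TagSet implementation.
from typing import Iterable, Optional


class TagSetSketch:
    def __init__(self, tags: Optional[Iterable[str]] = None):
        self._tags = set()
        for tag in tags or []:
            if ":" not in tag:
                raise ValueError(f"Invalid tag format: {tag!r}")
            key, _, value = tag.partition(":")
            if not key or not value:
                raise ValueError("Tag key and value must be non-empty")
            self._tags.add(tag)  # set() deduplicates; comparison stays case-sensitive

    def __len__(self) -> int:
        return len(self._tags)

    def is_empty(self) -> bool:
        return not self._tags


assert len(TagSetSketch(["priority:high", "priority:high", "env:test"])) == 2
assert len(TagSetSketch(["url:https://example.com", "path:/usr/local/bin"])) == 2
```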