|
2 | 2 | import logging |
3 | 3 | import time |
4 | 4 | from abc import ABC, abstractmethod |
5 | | -from typing import Callable, List |
| 5 | +from dataclasses import dataclass |
| 6 | +from typing import Callable, List, Optional |
6 | 7 |
|
7 | 8 | import geopandas as gpd |
8 | 9 | import pandas as pd |
@@ -367,11 +368,25 @@ def validate( |
367 | 368 | print("\n[SCHEMA VALIDATION ERROR]") |
368 | 369 | print("First 10 failure cases:") |
369 | 370 | print(err.failure_cases.head(10).to_string(index=False)) |
370 | | - # Add each failure as a separate error |
| 371 | + # Add each failure as a custom error message |
371 | 372 | for _, row in err.failure_cases.iterrows(): |
372 | | - self.errors.append( |
373 | | - f"Schema validation failed for {row['column']}: {row['failure_case']}" |
374 | | - ) |
| 373 | + col = row.get("column", "") |
| 374 | + check = row.get("check", "") |
| 375 | + failure = row.get("failure_case", "") |
| 376 | + # Try to make the message as close as possible to your old custom ones |
| 377 | + if "mean should be roughly" in str(failure): |
| 378 | + msg = f"{col} mean appears outside expected range: {failure}" |
| 379 | + elif "standard deviation should be roughly" in str(failure): |
| 380 | + msg = f"{col} standard deviation appears outside expected range: {failure}" |
| 381 | + elif "max" in str(failure) or "min" in str(failure): |
| 382 | + msg = f"{col} values appear outside expected range: {failure}" |
| 383 | + elif "first quantile" in str(failure): |
| 384 | + msg = f"{col} first quantile appears outside expected range: {failure}" |
| 385 | + elif "third quantile" in str(failure): |
| 386 | + msg = f"{col} third quantile appears outside expected range: {failure}" |
| 387 | + else: |
| 388 | + msg = f"{col} failed check '{check}': {failure}" |
| 389 | + self.errors.append(f"Schema validation failed: {msg}") |
375 | 390 | return ValidationResult(success=False, errors=self.errors.copy()) |
376 | 391 | schema_time = time.time() - schema_start |
377 | 392 |
|
@@ -685,3 +700,93 @@ def wrapper(gdf: gpd.GeoDataFrame = None, *args, **kwargs): |
685 | 700 | return wrapper |
686 | 701 |
|
687 | 702 | return decorator |
| 703 | + |
| 704 | + |
# Reusable column check: reject any cell equal to the literal string "NA".
no_na_check = Check.ne("NA", error="Value cannot be NA")

# Reusable column check: the series must contain no duplicate values.
unique_check = Check(lambda s: s.is_unique, error="Should have all unique values")
| 708 | + |
| 709 | + |
def unique_value_check(lower: int, upper: int) -> Check:
    """Return a Check that the series' distinct-value count lies in [lower, upper).

    Note the half-open interval: ``lower`` is inclusive and ``upper`` is
    exclusive, even though the error message phrases it as "roughly between".
    """
    return Check(
        # Chained comparison evaluates s.nunique() once instead of twice.
        lambda s: lower <= s.nunique() < upper,
        error=f"Number of unique values is roughly between {lower} and {upper}",
    )
| 715 | + |
| 716 | + |
def null_percentage_check(null_percent: float) -> Check:
    """Return a Check that the series' null fraction is within ±20% of *null_percent*.

    ``null_percent`` is a fraction in [0, 1]; the check accepts a series whose
    observed null fraction falls in [0.8 * null_percent, 1.2 * null_percent].
    """
    return Check(
        # Chained comparison computes s.isnull().mean() once instead of twice.
        lambda s: 0.8 * null_percent <= s.isnull().mean() <= 1.2 * null_percent,
        error=f"Percentage of nulls in column should be roughly {null_percent}",
    )
| 723 | + |
| 724 | + |
| 725 | +@dataclass |
| 726 | +class DistributionParams: |
| 727 | + min_value: Optional[int | float] = None |
| 728 | + max_value: Optional[int | float] = None |
| 729 | + mean: Optional[int | float] = None |
| 730 | + median: Optional[int | float] = None |
| 731 | + std: Optional[int | float] = None |
| 732 | + q1: Optional[int | float] = None |
| 733 | + q3: Optional[int | float] = None |
| 734 | + |
| 735 | + |
def _tolerance_check(stat: Callable, target: int | float, error: str) -> Check:
    """Build a Check asserting ``stat`` of the numeric-coerced series is within ±20% of ``target``."""
    # sorted() keeps the band non-empty when target is negative
    # (for target < 0, 0.8 * target > 1.2 * target, so the naive band is inverted).
    lo, hi = sorted((0.8 * target, 1.2 * target))
    return Check(
        lambda s: lo <= stat(pd.to_numeric(s, errors="coerce")) <= hi,
        error=error,
    )


def distribution_check(params: DistributionParams) -> List[Check]:
    """Translate a :class:`DistributionParams` spec into a list of Checks.

    Each non-None field produces one Check run against the series coerced to
    numeric (non-numeric entries become NaN). ``min_value``/``max_value`` are
    hard bounds; every other statistic is accepted within a ±20% tolerance
    band around the expected value.
    """
    checks: List[Check] = []

    # Compare against None explicitly: a threshold of 0 is falsy but valid,
    # and a plain `if params.min_value:` would silently skip it.
    if params.min_value is not None:
        checks.append(
            Check(
                lambda s: pd.to_numeric(s, errors="coerce").min() >= params.min_value,
                error=f"Column min should be at least {params.min_value}",
            )
        )
    if params.max_value is not None:
        checks.append(
            Check(
                lambda s: pd.to_numeric(s, errors="coerce").max() <= params.max_value,
                error=f"Column max should be at most {params.max_value}",
            )
        )
    if params.mean is not None:
        checks.append(
            _tolerance_check(
                lambda v: v.mean(),
                params.mean,
                f"Column mean should be roughly {params.mean}",
            )
        )
    if params.median is not None:
        checks.append(
            _tolerance_check(
                lambda v: v.quantile(0.5),
                params.median,
                f"Column median should be roughly {params.median}",
            )
        )
    if params.std is not None:
        checks.append(
            _tolerance_check(
                lambda v: v.std(),
                params.std,
                f"Column standard deviation should be roughly {params.std}",
            )
        )
    if params.q1 is not None:
        checks.append(
            _tolerance_check(
                lambda v: v.quantile(0.25),
                params.q1,
                f"Column first quantile should be roughly {params.q1}",
            )
        )
    if params.q3 is not None:
        checks.append(
            _tolerance_check(
                lambda v: v.quantile(0.75),
                params.q3,
                f"Column third quantile should be roughly {params.q3}",
            )
        )
    return checks
0 commit comments