Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/src/constants/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@

DELINQUENCIES_QUERY = "SELECT * FROM real_estate_tax_delinquencies"

OPA_PROPERTIES_QUERY = "SELECT building_code_description, market_value, sale_date, sale_price, parcel_number, location AS street_address, owner_1, owner_2, mailing_address_1, mailing_address_2, mailing_care_of, mailing_street, mailing_zip, mailing_city_state, unit, zip_code, zoning, the_geom FROM opa_properties_public"
OPA_PROPERTIES_QUERY = "SELECT building_code_description, market_value, sale_date, sale_price, parcel_number, location as street_address,owner_1, owner_2, mailing_address_1, mailing_address_2, mailing_care_of, mailing_street, mailing_zip, mailing_city_state, unit, zip_code, zoning, the_geom FROM opa_properties_public"

PWD_PARCELS_QUERY = "SELECT *, the_geom FROM pwd_parcels"

Expand Down
5 changes: 1 addition & 4 deletions data/src/data_utils/opa_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,6 @@ def opa_properties(
print(f"[OPA_PROPERTIES] load_or_fetch completed: {load_time:.3f}s")
print(f"[OPA_PROPERTIES] Loaded {len(opa)} rows")

print(opa.columns)
print(opa.head())

# Convert 'sale_price' and 'market_value' to numeric values
numeric_start = time.time()
print("[OPA_PROPERTIES] Converting sale_price and market_value to numeric")
Expand Down Expand Up @@ -241,7 +238,7 @@ def opa_properties(
print("[OPA_PROPERTIES] Combining street_address and unit")
opa["street_address"] = opa.apply(
lambda row: f"{row['street_address']} {row['unit']}"
if pd.notnull(row.get("unit")) and str(row["unit"]).strip() != ""
if pd.notnull(row["unit"]) and str(row["unit"]).strip() != ""
else row["street_address"],
axis=1,
)
Expand Down
20 changes: 1 addition & 19 deletions data/src/validation/access_process.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,12 @@
import geopandas as gpd
from pandera.pandas import Check, Column, DataFrameSchema

from .base import BaseValidator

output_schema = DataFrameSchema(
{
"access_process": Column(
str,
checks=Check.isin(
[
"Private Land Use Agreement",
"Go through Land Bank",
"PRA",
"Do Nothing",
"Buy Property",
]
),
)
}
)


class AccessProcessOutputValidator(BaseValidator):
"""Validator for access process service output."""

schema = output_schema
schema = None

def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass
106 changes: 5 additions & 101 deletions data/src/validation/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,10 @@
import logging
import time
from abc import ABC
from dataclasses import dataclass
from typing import Callable, List, Optional
from typing import Callable, List

import geopandas as gpd
import pandas as pd
import pandera.pandas as pa
from pandera import Check

from src.config.config import USE_CRS
from src.constants.city_limits import PHL_GEOMETRY
Expand All @@ -26,16 +23,13 @@ def __bool__(self):
class BaseValidator(ABC):
"""Base class for service-specific data validation."""

schema = None
schema: pa.DataFrameSchema = None

def __init_subclass__(cls):
schema = getattr(cls, "schema", None)
if schema is not None and not isinstance(schema, pa.DataFrameSchema):
print(type(schema))
print(isinstance(schema, type))
print(isinstance(schema, pa.DataFrameSchema))
raise TypeError(
f"{cls.__name__} must define a 'schema' class variable that is an instance of pandera.DataFrameSchema."
f"{cls.__name__} must define a 'schema' class variable that is a subclass of pandera.SchemaModel."
)
return super().__init_subclass__()

Expand Down Expand Up @@ -201,9 +195,9 @@ def validate(self, gdf: gpd.GeoDataFrame) -> ValidationResult:
schema_start = time.time()
if self.schema:
try:
self.schema.validate(gdf, lazy=True)
self.schema.validate(gdf, lazy_validation=True)
except pa.errors.SchemaErrors as err:
self.errors.append(err.failure_cases)
self.errors.append(err.failure_case)
schema_time = time.time() - schema_start

# Custom validation
Expand Down Expand Up @@ -262,93 +256,3 @@ def wrapper(gdf: gpd.GeoDataFrame = None, *args, **kwargs):
return wrapper

return decorator


# Reusable element-wise check: a value fails when it equals the literal string "NA".
no_na_check = Check.ne("NA", error="Value cannot be NA")

# Reusable column-level check: passes only when the column contains no duplicate values.
unique_check = Check(lambda s: s.is_unique, error="Should have all unique values")


def unique_value_check(lower: int, upper: int) -> Check:
    """Return a check that a column's distinct-value count is in ``[lower, upper)``.

    Args:
        lower: Minimum number of distinct values (inclusive).
        upper: Maximum number of distinct values (exclusive).

    Returns:
        A column-level ``Check`` that passes when ``lower <= nunique < upper``.
    """
    message = f"Number of unique values is roughly between {lower} and {upper}"
    # Chained comparison counts the distinct values once instead of twice.
    return Check(lambda s: lower <= s.nunique() < upper, error=message)


def null_percentage_check(null_percent: float) -> Check:
    """Return a check that a column's null fraction is close to ``null_percent``.

    The fraction of null entries (``isnull().mean()``) must fall within
    80%-120% of ``null_percent``, both bounds inclusive.

    Args:
        null_percent: Expected fraction of null values in the column.

    Returns:
        A column-level ``Check`` enforcing the tolerance band.
    """
    message = f"Percentage of nulls in column should be roughly {null_percent}"
    # Chained comparison evaluates the null fraction a single time.
    return Check(
        lambda s: 0.8 * null_percent <= s.isnull().mean() <= 1.2 * null_percent,
        error=message,
    )


@dataclass
class DistributionParams:
    """Expected summary statistics for a numeric column.

    Consumed by ``distribution_check``. Every field defaults to ``None``;
    a field left as ``None`` produces no check.
    """

    # Lower bound for the column minimum (column min must be >= this value).
    min_value: Optional[int | float] = None
    # Upper bound for the column maximum (column max must be <= this value).
    max_value: Optional[int | float] = None
    # Expected mean; compared against a 0.8x-1.2x band around this value.
    mean: Optional[int | float] = None
    # Expected median (0.5 quantile); compared against a 0.8x-1.2x band.
    median: Optional[int | float] = None
    # Expected standard deviation; compared against a 0.8x-1.2x band.
    std: Optional[int | float] = None
    # Expected first quartile (0.25 quantile); compared against a 0.8x-1.2x band.
    q1: Optional[int | float] = None
    # Expected third quartile (0.75 quantile); compared against a 0.8x-1.2x band.
    q3: Optional[int | float] = None


def distribution_check(params: DistributionParams) -> List[Check]:
    """Build column-level checks comparing summary statistics to ``params``.

    A check is created for every field of ``params`` that is not ``None``.
    Fields are tested with ``is not None`` rather than truthiness so that a
    legitimate target of ``0`` (for example ``q1=0.0``) is still enforced
    instead of being silently skipped.

    * ``min_value`` / ``max_value`` are hard bounds on the column's minimum
      and maximum.
    * ``mean``, ``median``, ``std``, ``q1`` and ``q3`` must fall within a
      +/-20% band around the target. Because the tolerance is relative, a
      target of exactly ``0`` requires the statistic to equal ``0``.

    Values are coerced with ``pd.to_numeric(errors="coerce")`` before each
    statistic is computed, so non-numeric entries become NaN.

    Args:
        params: Expected distribution parameters for the column.

    Returns:
        The checks in the order: min, max, mean, median, std, q1, q3.
    """

    def _numeric(s):
        return pd.to_numeric(s, errors="coerce")

    def _roughly(stat, target, label) -> Check:
        # Sort the bounds so a negative target still yields low <= high;
        # with unsorted 0.8x/1.2x bounds such a check could never pass.
        low, high = sorted((0.8 * target, 1.2 * target))
        return Check(
            lambda s: low <= stat(_numeric(s)) <= high,
            error=f"Column {label} should be roughly {target}",
        )

    res = []

    if params.min_value is not None:
        min_value = params.min_value
        res.append(
            Check(
                lambda s: _numeric(s).min() >= min_value,
                error=f"Column minimum should be at least {min_value}",
            )
        )
    if params.max_value is not None:
        max_value = params.max_value
        res.append(
            Check(
                lambda s: _numeric(s).max() <= max_value,
                error=f"Column maximum should be at most {max_value}",
            )
        )
    if params.mean is not None:
        res.append(_roughly(lambda v: v.mean(), params.mean, "mean"))
    if params.median is not None:
        res.append(_roughly(lambda v: v.quantile(0.5), params.median, "median"))
    if params.std is not None:
        res.append(_roughly(lambda v: v.std(), params.std, "standard deviation"))
    if params.q1 is not None:
        res.append(_roughly(lambda v: v.quantile(0.25), params.q1, "first quantile"))
    if params.q3 is not None:
        res.append(_roughly(lambda v: v.quantile(0.75), params.q3, "third quantile"))

    return res
6 changes: 1 addition & 5 deletions data/src/validation/community_gardens.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import geopandas as gpd
from pandera.pandas import Column, DataFrameSchema

from .base import BaseValidator

Expand All @@ -13,13 +12,10 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass


output_schema = DataFrameSchema({"vacant": Column(bool)})


class CommunityGardensOutputValidator(BaseValidator):
"""Validator for community gardens service output."""

schema = output_schema
schema = None

def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass
7 changes: 1 addition & 6 deletions data/src/validation/conservatorship.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
import geopandas as gpd
from pandera.pandas import Check, Column, DataFrameSchema

from .base import BaseValidator

output_schema = DataFrameSchema(
{"tactical_urbanism": Column(str, checks=Check.isin(["Y", "N"]))}
)


class ConservatorshipOutputValidator(BaseValidator):
"""Validator for conservatorship service output."""

schema = output_schema
schema = None

def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass
10 changes: 2 additions & 8 deletions data/src/validation/contig_neighbors.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
import geopandas as gpd
from pandera.pandas import Column, DataFrameSchema

from .base import BaseValidator, DistributionParams, distribution_check

params = DistributionParams(max_value=49, mean=2.566, std=4.873, q1=0.000, q3=3.000)
output_schema = DataFrameSchema(
{"n_contiguous": Column(int, checks=[*distribution_check(params)], coerce=True)}
)
from .base import BaseValidator


class ContigNeighborsOutputValidator(BaseValidator):
"""Validator for contiguous neighbors service output."""

schema = output_schema
schema = None

def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass
42 changes: 2 additions & 40 deletions data/src/validation/delinquencies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import geopandas as gpd
from pandera.pandas import Check, Column, DataFrameSchema

from .base import BaseValidator, DistributionParams, distribution_check
from .base import BaseValidator


class DelinquenciesInputValidator(BaseValidator):
Expand All @@ -13,47 +12,10 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass


total_due_params = DistributionParams(
max_value=951046.42,
mean=7291.178875,
std=14821.81088,
q1=873.21,
q3=8301.53,
)
total_assessment_params = DistributionParams(
max_value=137576900,
mean=146337.2527,
std=1474304.277,
q1=29300,
q3=116800,
)
num_year_owed_params = DistributionParams(
max_value=45, mean=7.641, std=8.923, q1=2.000, q3=10.000
)

output_schema = DataFrameSchema(
{
"total_due": Column(
float, checks=[*distribution_check(total_due_params)], coerce=True
),
"most_recent_year_owed": Column(str),
"num_years_owed": Column(
int, checks=[*distribution_check(num_year_owed_params)], coerce=True
),
"payment_agreement": Column(bool, coerce=True),
"is_actionable": Column(bool),
"sheriff_sale": Column(str, checks=Check.isin(["Y", "N"])),
"total_assessment": Column(
float, checks=[*distribution_check(total_assessment_params)], coerce=True
),
}
)


class DelinquenciesOutputValidator(BaseValidator):
"""Validator for delinquencies service output."""

schema = output_schema
schema = None

def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass
17 changes: 2 additions & 15 deletions data/src/validation/dev_probability.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import geopandas as gpd
from pandera.pandas import Check, Column, DataFrameSchema

from .base import BaseValidator, DistributionParams, distribution_check
from .base import BaseValidator


class DevProbabilityInputValidator(BaseValidator):
Expand All @@ -13,22 +12,10 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass


permit_counts_params = DistributionParams(
mean=42.129, std=44.789, max_value=413.000, q1=18.000, q3=46.000
)

output_schema = DataFrameSchema(
{
"permit_count": Column(int, checks=[*distribution_check(permit_counts_params)]),
"dev_rank": Column(str, checks=Check.isin(["Low", "Medium", "High"])),
}
)


class DevProbabilityOutputValidator(BaseValidator):
"""Validator for dev probability service output."""

schema = output_schema
schema = None

def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass
Loading
Loading