Commit 511cfef

Output Validators (#1254)
Defines the set of output validators for each data service using `DataFrameSchema` from `pandera`. Additional work and investigation are needed to see where the data or the schema descriptions might be failing or insufficient.
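The validators lean on pandera's `DataFrameSchema` with lazy validation, as wired up in `BaseValidator.validate` below. A minimal standalone sketch of that pattern, with illustrative data only (it assumes a pandera release that ships the `pandera.pandas` namespace, which the diff itself imports):

# Minimal sketch of the core pandera pattern (illustrative data; the column name
# "dev_rank" mirrors the dev_probability schema in this commit).
import pandas as pd
import pandera.pandas as pa
from pandera.pandas import Check, Column, DataFrameSchema

schema = DataFrameSchema(
    {"dev_rank": Column(str, checks=Check.isin(["Low", "Medium", "High"]))}
)

df = pd.DataFrame({"dev_rank": ["Low", "High", "Unknown"]})

try:
    # lazy=True collects every failing check instead of raising on the first,
    # matching how BaseValidator.validate aggregates errors below.
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    # failure_cases is a DataFrame with one row per failing value/check.
    print(err.failure_cases)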
1 parent 5a23066 commit 511cfef

27 files changed (+502, -47 lines)

data/src/constants/services.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@
 
 DELINQUENCIES_QUERY = "SELECT * FROM real_estate_tax_delinquencies"
 
-OPA_PROPERTIES_QUERY = "SELECT building_code_description, market_value, sale_date, sale_price, parcel_number, location as street_address,owner_1, owner_2, mailing_address_1, mailing_address_2, mailing_care_of, mailing_street, mailing_zip, mailing_city_state, unit, zip_code, zoning, the_geom FROM opa_properties_public"
+OPA_PROPERTIES_QUERY = "SELECT building_code_description, market_value, sale_date, sale_price, parcel_number, location AS street_address, owner_1, owner_2, mailing_address_1, mailing_address_2, mailing_care_of, mailing_street, mailing_zip, mailing_city_state, unit, zip_code, zoning, the_geom FROM opa_properties_public"
 
 PWD_PARCELS_QUERY = "SELECT *, the_geom FROM pwd_parcels"
 
data/src/data_utils/opa_properties.py

Lines changed: 4 additions & 1 deletion
@@ -207,6 +207,9 @@ def opa_properties(
     print(f"[OPA_PROPERTIES] load_or_fetch completed: {load_time:.3f}s")
     print(f"[OPA_PROPERTIES] Loaded {len(opa)} rows")
 
+    print(opa.columns)
+    print(opa.head())
+
     # Convert 'sale_price' and 'market_value' to numeric values
     numeric_start = time.time()
     print("[OPA_PROPERTIES] Converting sale_price and market_value to numeric")
@@ -238,7 +241,7 @@ def opa_properties(
     print("[OPA_PROPERTIES] Combining street_address and unit")
     opa["street_address"] = opa.apply(
         lambda row: f"{row['street_address']} {row['unit']}"
-        if pd.notnull(row["unit"]) and str(row["unit"]).strip() != ""
+        if pd.notnull(row.get("unit")) and str(row["unit"]).strip() != ""
         else row["street_address"],
         axis=1,
     )
Lines changed: 19 additions & 1 deletion
@@ -1,12 +1,30 @@
 import geopandas as gpd
+from pandera.pandas import Check, Column, DataFrameSchema
 
 from .base import BaseValidator
 
+output_schema = DataFrameSchema(
+    {
+        "access_process": Column(
+            str,
+            checks=Check.isin(
+                [
+                    "Private Land Use Agreement",
+                    "Go through Land Bank",
+                    "PRA",
+                    "Do Nothing",
+                    "Buy Property",
+                ]
+            ),
+        )
+    }
+)
+
 
 class AccessProcessOutputValidator(BaseValidator):
     """Validator for access process service output."""
 
-    schema = None
+    schema = output_schema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass

data/src/validation/base.py

Lines changed: 102 additions & 8 deletions
@@ -2,10 +2,13 @@
 import logging
 import time
 from abc import ABC
-from typing import Callable, List
+from dataclasses import dataclass
+from typing import Callable, List, Optional
 
 import geopandas as gpd
+import pandas as pd
 import pandera.pandas as pa
+from pandera import Check
 
 from src.config.config import USE_CRS
 from src.constants.city_limits import PHL_GEOMETRY
@@ -23,15 +26,16 @@ def __bool__(self):
 class BaseValidator(ABC):
     """Base class for service-specific data validation."""
 
-    schema: pa.DataFrameModel = None
+    schema: pa.DataFrameSchema = None
 
     def __init_subclass__(cls):
         schema = getattr(cls, "schema", None)
-        if schema is not None and (
-            not isinstance(schema, type) or not isinstance(schema, pa.DataFrameModel)
-        ):
+        if schema is not None and not isinstance(schema, pa.DataFrameSchema):
+            print(type(schema))
+            print(isinstance(schema, type))
+            print(isinstance(schema, pa.DataFrameSchema))
             raise TypeError(
-                f"{cls.__name__} must define a 'schema' class variable that is a subclass of pandera.SchemaModel."
+                f"{cls.__name__} must define a 'schema' class variable that is an instance of pandera.DataFrameSchema."
             )
         return super().__init_subclass__()
 
@@ -197,9 +201,9 @@ def validate(self, gdf: gpd.GeoDataFrame) -> ValidationResult:
         schema_start = time.time()
         if self.schema:
             try:
-                self.schema.validate(gdf, lazy_validation=True)
+                self.schema.validate(gdf, lazy=True)
             except pa.errors.SchemaErrors as err:
-                self.errors.append(err.failure_case)
+                self.errors.append(err.failure_cases)
         schema_time = time.time() - schema_start
 
         # Custom validation
@@ -258,3 +262,93 @@ def wrapper(gdf: gpd.GeoDataFrame = None, *args, **kwargs):
         return wrapper
 
     return decorator
+
+
+no_na_check = Check.ne("NA", error="Value cannot be NA")
+
+unique_check = Check(lambda s: s.is_unique, error="Should have all unique values")
+
+
+def unique_value_check(lower: int, upper: int) -> Check:
+    return Check(
+        lambda s: s.nunique() >= lower and s.nunique() < upper,
+        error=f"Number of unique values is roughly between {lower} and {upper}",
+    )
+
+
+def null_percentage_check(null_percent: float) -> Check:
+    return Check(
+        lambda s: s.isnull().mean() >= 0.8 * null_percent
+        and s.isnull().mean() <= 1.2 * null_percent,
+        error=f"Percentage of nulls in column should be roughly {null_percent}",
+    )
+
+
+@dataclass
+class DistributionParams:
+    min_value: Optional[int | float] = None
+    max_value: Optional[int | float] = None
+    mean: Optional[int | float] = None
+    median: Optional[int | float] = None
+    std: Optional[int | float] = None
+    q1: Optional[int | float] = None
+    q3: Optional[int | float] = None
+
+
+def distribution_check(params: DistributionParams) -> List[Check]:
+    res = []
+
+    if params.min_value:
+        res.append(
+            Check(lambda s: pd.to_numeric(s, errors="coerce").min() >= params.min_value)
+        )
+    if params.max_value:
+        res.append(
+            Check(lambda s: pd.to_numeric(s, errors="coerce").max() <= params.max_value)
+        )
+    if params.mean:
+        res.append(
+            Check(
+                lambda s: pd.to_numeric(s, errors="coerce").mean() >= 0.8 * params.mean
+                and pd.to_numeric(s, errors="coerce").mean() <= 1.2 * params.mean,
+                error=f"Column mean should be roughly {params.mean}",
+            )
+        )
+    if params.median:
+        res.append(
+            Check(
+                lambda s: pd.to_numeric(s, errors="coerce").quantile(0.5)
+                >= 0.8 * params.median
+                and pd.to_numeric(s, errors="coerce").quantile(0.5)
+                <= 1.2 * params.median,
+                error=f"Column median should be roughly {params.median}",
+            )
+        )
+    if params.std:
+        res.append(
+            Check(
+                lambda s: pd.to_numeric(s, errors="coerce").std() >= 0.8 * params.std
+                and pd.to_numeric(s, errors="coerce").std() <= 1.2 * params.std,
+                error=f"Column standard deviation should be roughly {params.std}",
+            )
+        )
+    if params.q1:
+        res.append(
+            Check(
+                lambda s: pd.to_numeric(s, errors="coerce").quantile(0.25)
+                >= 0.8 * params.q1
+                and pd.to_numeric(s, errors="coerce").quantile(0.25) <= 1.2 * params.q1,
+                error=f"Column first quantile should be roughly {params.q1}",
+            )
+        )
+    if params.q3:
+        res.append(
+            Check(
+                lambda s: pd.to_numeric(s, errors="coerce").quantile(0.75)
+                >= 0.8 * params.q3
+                and pd.to_numeric(s, errors="coerce").quantile(0.75) <= 1.2 * params.q3,
+                error=f"Column third quantile should be roughly {params.q3}",
+            )
+        )
+
+    return res
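
The helpers added above are meant to be composed into column definitions in the per-service validator modules that follow. A sketch of that composition, mirroring those modules (the column names, statistics, and the absolute import path are assumptions for illustration):

# Sketch: composing DistributionParams, distribution_check, and
# null_percentage_check into a column schema. Names and values are illustrative.
from pandera.pandas import Column, DataFrameSchema

from src.validation.base import (  # assumed absolute path to the module above
    DistributionParams,
    distribution_check,
    null_percentage_check,
)

params = DistributionParams(max_value=100, mean=12.5, std=8.0, q1=4.0, q3=18.0)

output_schema = DataFrameSchema(
    {
        # Each statistic set on DistributionParams contributes one Check; min/max
        # are hard bounds, the rest must land within 0.8x-1.2x of the reference.
        "example_count": Column(int, checks=[*distribution_check(params)], coerce=True),
        # Passes when the column's null fraction is within 0.8x-1.2x of 0.25.
        "example_label": Column(str, checks=[null_percentage_check(0.25)], nullable=True),
    }
)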

data/src/validation/community_gardens.py

Lines changed: 5 additions & 1 deletion
@@ -1,4 +1,5 @@
 import geopandas as gpd
+from pandera.pandas import Column, DataFrameSchema
 
 from .base import BaseValidator
 
@@ -12,10 +13,13 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass
 
 
+output_schema = DataFrameSchema({"vacant": Column(bool)})
+
+
 class CommunityGardensOutputValidator(BaseValidator):
     """Validator for community gardens service output."""
 
-    schema = None
+    schema = output_schema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass
Lines changed: 6 additions & 1 deletion
@@ -1,12 +1,17 @@
 import geopandas as gpd
+from pandera.pandas import Check, Column, DataFrameSchema
 
 from .base import BaseValidator
 
+output_schema = DataFrameSchema(
+    {"tactical_urbanism": Column(str, checks=Check.isin(["Y", "N"]))}
+)
+
 
 class ConservatorshipOutputValidator(BaseValidator):
     """Validator for conservatorship service output."""
 
-    schema = None
+    schema = output_schema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass
Lines changed: 8 additions & 2 deletions
@@ -1,12 +1,18 @@
 import geopandas as gpd
+from pandera.pandas import Column, DataFrameSchema
 
-from .base import BaseValidator
+from .base import BaseValidator, DistributionParams, distribution_check
+
+params = DistributionParams(max_value=49, mean=2.566, std=4.873, q1=0.000, q3=3.000)
+output_schema = DataFrameSchema(
+    {"n_contiguous": Column(int, checks=[*distribution_check(params)], coerce=True)}
+)
 
 
 class ContigNeighborsOutputValidator(BaseValidator):
     """Validator for contiguous neighbors service output."""
 
-    schema = None
+    schema = output_schema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass

data/src/validation/delinquencies.py

Lines changed: 40 additions & 2 deletions
@@ -1,6 +1,7 @@
 import geopandas as gpd
+from pandera.pandas import Check, Column, DataFrameSchema
 
-from .base import BaseValidator
+from .base import BaseValidator, DistributionParams, distribution_check
 
 
 class DelinquenciesInputValidator(BaseValidator):
@@ -12,10 +13,47 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass
 
 
+total_due_params = DistributionParams(
+    max_value=951046.42,
+    mean=7291.178875,
+    std=14821.81088,
+    q1=873.21,
+    q3=8301.53,
+)
+total_assessment_params = DistributionParams(
+    max_value=137576900,
+    mean=146337.2527,
+    std=1474304.277,
+    q1=29300,
+    q3=116800,
+)
+num_year_owed_params = DistributionParams(
+    max_value=45, mean=7.641, std=8.923, q1=2.000, q3=10.000
+)
+
+output_schema = DataFrameSchema(
+    {
+        "total_due": Column(
+            float, checks=[*distribution_check(total_due_params)], coerce=True
+        ),
+        "most_recent_year_owed": Column(str),
+        "num_years_owed": Column(
+            int, checks=[*distribution_check(num_year_owed_params)], coerce=True
+        ),
+        "payment_agreement": Column(bool, coerce=True),
+        "is_actionable": Column(bool),
+        "sheriff_sale": Column(str, checks=Check.isin(["Y", "N"])),
+        "total_assessment": Column(
+            float, checks=[*distribution_check(total_assessment_params)], coerce=True
+        ),
+    }
+)
+
+
 class DelinquenciesOutputValidator(BaseValidator):
     """Validator for delinquencies service output."""
 
-    schema = None
+    schema = output_schema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass

data/src/validation/dev_probability.py

Lines changed: 15 additions & 2 deletions
@@ -1,6 +1,7 @@
 import geopandas as gpd
+from pandera.pandas import Check, Column, DataFrameSchema
 
-from .base import BaseValidator
+from .base import BaseValidator, DistributionParams, distribution_check
 
 
 class DevProbabilityInputValidator(BaseValidator):
@@ -12,10 +13,22 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass
 
 
+permit_counts_params = DistributionParams(
+    mean=42.129, std=44.789, max_value=413.000, q1=18.000, q3=46.000
+)
+
+output_schema = DataFrameSchema(
+    {
+        "permit_count": Column(int, checks=[*distribution_check(permit_counts_params)]),
+        "dev_rank": Column(str, checks=Check.isin(["Low", "Medium", "High"])),
+    }
+)
+
+
 class DevProbabilityOutputValidator(BaseValidator):
     """Validator for dev probability service output."""
 
-    schema = None
+    schema = output_schema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame):
         pass
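
A note on the tolerance these schemas apply: distribution_check and null_percentage_check accept an observed statistic that falls within 0.8x-1.2x of the recorded reference value, while min_value and max_value act as hard bounds. Parameters supplied as 0 (for example q1=0.000) fall through the truthiness checks and contribute no Check. A small worked example of the band, using the permit_count mean recorded above:

# Worked example of the acceptance band applied by distribution_check.
reference_mean = 42.129  # permit_count mean recorded in the dev probability schema
lower, upper = 0.8 * reference_mean, 1.2 * reference_mean
print(f"accepted mean range: {lower:.2f} - {upper:.2f}")  # roughly 33.70 - 50.55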
