Skip to content

Commit 81f7413

Browse files
committed
fix: rebase against staging, incorporate changes
2 parents bf73e7d + 511cfef commit 81f7413

16 files changed

+265
-29
lines changed

data/src/constants/services.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262

6363
DELINQUENCIES_QUERY = "SELECT * FROM real_estate_tax_delinquencies"
6464

65-
OPA_PROPERTIES_QUERY = "SELECT building_code_description, market_value, sale_date, sale_price, parcel_number, location as street_address,owner_1, owner_2, mailing_address_1, mailing_address_2, mailing_care_of, mailing_street, mailing_zip, mailing_city_state, unit, zip_code, zoning, the_geom FROM opa_properties_public"
65+
OPA_PROPERTIES_QUERY = "SELECT building_code_description, market_value, sale_date, sale_price, parcel_number, location AS street_address, owner_1, owner_2, mailing_address_1, mailing_address_2, mailing_care_of, mailing_street, mailing_zip, mailing_city_state, unit, zip_code, zoning, the_geom FROM opa_properties_public"
6666

6767
PWD_PARCELS_QUERY = "SELECT *, the_geom FROM pwd_parcels"
6868

data/src/data_utils/opa_properties.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,9 @@ def opa_properties(
209209
performance_logger.info(f"load_or_fetch completed: {load_time:.3f}s")
210210
performance_logger.info(f"Loaded {len(opa)} rows")
211211

212+
print(opa.columns)
213+
print(opa.head())
214+
212215
# Convert 'sale_price' and 'market_value' to numeric values
213216
numeric_start = time.time()
214217
performance_logger.info("Converting sale_price and market_value to numeric")
@@ -253,7 +256,7 @@ def opa_properties(
253256
performance_logger.info("Combining street_address and unit")
254257
opa["street_address"] = opa.apply(
255258
lambda row: f"{row['street_address']} {row['unit']}"
256-
if pd.notnull(row["unit"]) and str(row["unit"]).strip() != ""
259+
if pd.notnull(row.get("unit")) and str(row["unit"]).strip() != ""
257260
else row["street_address"],
258261
axis=1,
259262
)

data/src/test/validation/test_kde_validator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ def test_schema_missing_columns(schema_validator, sample_gdf):
193193

194194
result = schema_validator.validate(gdf_missing)
195195

196+
print("[DEBUG] result.errors:", result.errors)
196197
assert not result.success
197198
# Check for schema validation error (pandera generates different error format)
198199
assert any("Schema validation failed" in error for error in result.errors)

data/src/validation/access_process.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
import pandas as pd
33
import pandera.pandas as pa
44

5-
from .base import BaseValidator
5+
from .base import (
6+
BaseValidator,
7+
)
68

79
# Define the Access Process DataFrame Schema
810
AccessProcessSchema = pa.DataFrameSchema(
@@ -50,7 +52,7 @@ def _row_level_validation(self, gdf: gpd.GeoDataFrame, errors: list):
5052
required_columns = ["access_process"]
5153
self._validate_required_columns(gdf, required_columns, errors)
5254

53-
# Validate access_process column
55+
# Validate access_process column using utility functions
5456
if "access_process" in gdf.columns:
5557
# Check for non-string values (excluding NAs)
5658
non_string_access_processes = (
@@ -91,10 +93,8 @@ def _statistical_validation(self, gdf: gpd.GeoDataFrame, errors: list):
9193
f"Access process count ({total_records}) below expected minimum ({min_records:,})"
9294
)
9395

94-
# 2. Access process distribution validation
96+
# 2. Access process distribution validation using utility functions
9597
if "access_process" in gdf.columns:
96-
total_records = len(gdf)
97-
9898
# Check that we have some NAs (non-vacant properties)
9999
na_count = gdf["access_process"].isna().sum()
100100
na_pct = (na_count / total_records) * 100

data/src/validation/base.py

Lines changed: 110 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import logging
33
import time
44
from abc import ABC, abstractmethod
5-
from typing import Callable, List
5+
from dataclasses import dataclass
6+
from typing import Callable, List, Optional
67

78
import geopandas as gpd
89
import pandas as pd
@@ -367,11 +368,25 @@ def validate(
367368
print("\n[SCHEMA VALIDATION ERROR]")
368369
print("First 10 failure cases:")
369370
print(err.failure_cases.head(10).to_string(index=False))
370-
# Add each failure as a separate error
371+
# Add each failure as a custom error message
371372
for _, row in err.failure_cases.iterrows():
372-
self.errors.append(
373-
f"Schema validation failed for {row['column']}: {row['failure_case']}"
374-
)
373+
col = row.get("column", "")
374+
check = row.get("check", "")
375+
failure = row.get("failure_case", "")
376+
# Try to make the message as close as possible to your old custom ones
377+
if "mean should be roughly" in str(failure):
378+
msg = f"{col} mean appears outside expected range: {failure}"
379+
elif "standard deviation should be roughly" in str(failure):
380+
msg = f"{col} standard deviation appears outside expected range: {failure}"
381+
elif "max" in str(failure) or "min" in str(failure):
382+
msg = f"{col} values appear outside expected range: {failure}"
383+
elif "first quantile" in str(failure):
384+
msg = f"{col} first quantile appears outside expected range: {failure}"
385+
elif "third quantile" in str(failure):
386+
msg = f"{col} third quantile appears outside expected range: {failure}"
387+
else:
388+
msg = f"{col} failed check '{check}': {failure}"
389+
self.errors.append(f"Schema validation failed: {msg}")
375390
return ValidationResult(success=False, errors=self.errors.copy())
376391
schema_time = time.time() - schema_start
377392

@@ -685,3 +700,93 @@ def wrapper(gdf: gpd.GeoDataFrame = None, *args, **kwargs):
685700
return wrapper
686701

687702
return decorator
703+
704+
705+
# Reusable column-level checks shared by the service validators.

# Rejects the literal string "NA" as a column value.
no_na_check = Check.ne("NA", error="Value cannot be NA")

# Passes only when the column contains no repeated values.
unique_check = Check(
    lambda column: column.is_unique,
    error="Should have all unique values",
)
708+
709+
710+
def unique_value_check(lower: int, upper: int) -> Check:
    """Build a check on how many distinct values a column holds.

    Args:
        lower: Smallest acceptable number of distinct values (inclusive).
        upper: Number of distinct values the column must stay below (exclusive).

    Returns:
        A ``Check`` that passes when ``lower <= column.nunique() < upper``.
    """
    return Check(
        lambda column: lower <= column.nunique() < upper,
        error=f"Number of unique values is roughly between {lower} and {upper}",
    )
715+
716+
717+
def null_percentage_check(null_percent: float) -> Check:
    """Build a check that a column's share of nulls is near an expected value.

    The observed share is ``column.isnull().mean()`` and must fall within
    20% (relative) of ``null_percent`` on either side.

    Args:
        null_percent: Expected share of nulls.
            NOTE(review): it is compared against ``isnull().mean()``, which is
            a fraction in [0, 1] rather than a 0-100 percentage - confirm that
            callers pass a fraction.

    Returns:
        A ``Check`` that passes when the observed share lies in
        ``[0.8 * null_percent, 1.2 * null_percent]``.
    """
    low = 0.8 * null_percent
    high = 1.2 * null_percent
    return Check(
        lambda column: low <= column.isnull().mean() <= high,
        error=f"Percentage of nulls in column should be roughly {null_percent}",
    )
723+
724+
725+
@dataclass
726+
class DistributionParams:
727+
min_value: Optional[int | float] = None
728+
max_value: Optional[int | float] = None
729+
mean: Optional[int | float] = None
730+
median: Optional[int | float] = None
731+
std: Optional[int | float] = None
732+
q1: Optional[int | float] = None
733+
q3: Optional[int | float] = None
734+
735+
736+
def distribution_check(params: DistributionParams) -> List[Check]:
    """Build column checks from a set of expected summary statistics.

    Each field of ``params`` that is not ``None`` contributes one check. The
    column is coerced with ``pd.to_numeric(..., errors="coerce")`` before the
    statistic is computed.

    * ``min_value`` / ``max_value`` are hard bounds on the observed minimum
      and maximum.
    * ``mean``, ``median``, ``std``, ``q1`` and ``q3`` must fall within 20%
      (relative) of the expected value.

    A field is skipped only when it is ``None``. An expected value of ``0`` is
    therefore enforced; for the relative checks it collapses the tolerance
    band to exactly ``0``.

    Args:
        params: Expected statistics for the column.

    Returns:
        The checks, in the order min, max, mean, median, std, q1, q3.
    """

    def _numeric(column):
        # Unparseable entries become NaN, which the pandas reductions skip.
        return pd.to_numeric(column, errors="coerce")

    def _roughly(statistic, expected, label) -> Check:
        # Sort the bounds so a negative expected value still gives a
        # non-empty band (0.8 * x is greater than 1.2 * x when x < 0).
        low, high = sorted((0.8 * expected, 1.2 * expected))
        return Check(
            lambda column: bool(low <= statistic(_numeric(column)) <= high),
            error=f"Column {label} should be roughly {expected}",
        )

    res = []

    # Compare against None, not truthiness, so a bound of 0 is not dropped.
    if params.min_value is not None:
        floor = params.min_value
        res.append(
            Check(
                lambda column: bool(_numeric(column).min() >= floor),
                error=f"Column min should be at least {floor}",
            )
        )
    if params.max_value is not None:
        ceiling = params.max_value
        res.append(
            Check(
                lambda column: bool(_numeric(column).max() <= ceiling),
                error=f"Column max should be at most {ceiling}",
            )
        )

    relative_checks = (
        (params.mean, "mean", lambda values: values.mean()),
        (params.median, "median", lambda values: values.quantile(0.5)),
        (params.std, "standard deviation", lambda values: values.std()),
        (params.q1, "first quantile", lambda values: values.quantile(0.25)),
        (params.q3, "third quantile", lambda values: values.quantile(0.75)),
    )
    for expected, label, statistic in relative_checks:
        if expected is not None:
            res.append(_roughly(statistic, expected, label))

    return res

data/src/validation/community_gardens.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import geopandas as gpd
22
import pandera.pandas as pa
33

4-
from .base import BaseValidator
4+
from .base import (
5+
BaseValidator,
6+
)
57

68
# Define the Community Gardens DataFrame Schema
79
CommunityGardensSchema = pa.DataFrameSchema(
@@ -49,7 +51,7 @@ def _print_statistical_summary(self, gdf: gpd.GeoDataFrame):
4951
)
5052
print(f"Site names missing: {total_records - non_null_site_names:,}")
5153

52-
# Unique site names
54+
# Unique site names using utility function
5355
if "site_name" in gdf.columns:
5456
unique_site_names = gdf["site_name"].nunique()
5557
print(f"Unique site names: {unique_site_names:,}")
@@ -72,7 +74,7 @@ def _row_level_validation(self, gdf: gpd.GeoDataFrame, errors: list):
7274
required_columns = ["opa_id", "vacant", "geometry"]
7375
self._validate_required_columns(gdf, required_columns, errors)
7476

75-
# Validate vacant column is boolean
77+
# Validate vacant column is boolean using utility functions
7678
if "vacant" in gdf.columns:
7779
non_null_vacant = gdf["vacant"].dropna()
7880
if len(non_null_vacant) > 0:
@@ -92,7 +94,7 @@ def _statistical_validation(self, gdf: gpd.GeoDataFrame, errors: list):
9294
if total_records == 0:
9395
errors.append("Output dataset is empty")
9496

95-
# 2. Vacant column validation - check that some parcels are marked as non-vacant
97+
# 2. Vacant column validation using utility functions
9698
if "vacant" in gdf.columns:
9799
non_vacant_count = (~gdf["vacant"]).sum()
98100
vacant_count = gdf["vacant"].sum()
@@ -109,6 +111,10 @@ def _statistical_validation(self, gdf: gpd.GeoDataFrame, errors: list):
109111
"No vacant parcels found - this seems unlikely for a full property dataset"
110112
)
111113

114+
# Use utility function to validate boolean distribution
115+
# Expect roughly 2 unique values (True/False) for vacant column
116+
self._validate_unique_count(gdf, "vacant", errors, min_count=1, max_count=2)
117+
112118
def _print_statistical_summary(self, gdf: gpd.GeoDataFrame):
113119
"""Print comprehensive statistical summary of the community gardens data."""
114120
self._print_summary_header("Community Gardens Statistical Summary", gdf)
Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
import geopandas as gpd
2+
from pandera.pandas import Check, Column, DataFrameSchema
23

34
from .base import BaseValidator
45

6+
output_schema = DataFrameSchema(
7+
{"tactical_urbanism": Column(str, checks=Check.isin(["Y", "N"]))}
8+
)
9+
510

611
class ConservatorshipOutputValidator(BaseValidator):
712
"""Validator for conservatorship service output."""
813

9-
schema = None
14+
schema = output_schema
1015

1116
def _custom_validation(self, gdf: gpd.GeoDataFrame):
1217
pass
Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
import geopandas as gpd
2+
from pandera.pandas import Column, DataFrameSchema
23

3-
from .base import BaseValidator
4+
from .base import BaseValidator, DistributionParams, distribution_check
5+
6+
params = DistributionParams(max_value=49, mean=2.566, std=4.873, q1=0.000, q3=3.000)
7+
output_schema = DataFrameSchema(
8+
{"n_contiguous": Column(int, checks=[*distribution_check(params)], coerce=True)}
9+
)
410

511

612
class ContigNeighborsOutputValidator(BaseValidator):
713
"""Validator for contiguous neighbors service output."""
814

9-
schema = None
15+
schema = output_schema
1016

1117
def _custom_validation(self, gdf: gpd.GeoDataFrame):
1218
pass

data/src/validation/delinquencies.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import geopandas as gpd
2+
from pandera.pandas import Check, Column, DataFrameSchema
23

3-
from .base import BaseValidator
4+
from .base import BaseValidator, DistributionParams, distribution_check
45

56

67
class DelinquenciesInputValidator(BaseValidator):
@@ -12,10 +13,47 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
1213
pass
1314

1415

16+
total_due_params = DistributionParams(
17+
max_value=951046.42,
18+
mean=7291.178875,
19+
std=14821.81088,
20+
q1=873.21,
21+
q3=8301.53,
22+
)
23+
total_assessment_params = DistributionParams(
24+
max_value=137576900,
25+
mean=146337.2527,
26+
std=1474304.277,
27+
q1=29300,
28+
q3=116800,
29+
)
30+
num_year_owed_params = DistributionParams(
31+
max_value=45, mean=7.641, std=8.923, q1=2.000, q3=10.000
32+
)
33+
34+
output_schema = DataFrameSchema(
35+
{
36+
"total_due": Column(
37+
float, checks=[*distribution_check(total_due_params)], coerce=True
38+
),
39+
"most_recent_year_owed": Column(str),
40+
"num_years_owed": Column(
41+
int, checks=[*distribution_check(num_year_owed_params)], coerce=True
42+
),
43+
"payment_agreement": Column(bool, coerce=True),
44+
"is_actionable": Column(bool),
45+
"sheriff_sale": Column(str, checks=Check.isin(["Y", "N"])),
46+
"total_assessment": Column(
47+
float, checks=[*distribution_check(total_assessment_params)], coerce=True
48+
),
49+
}
50+
)
51+
52+
1553
class DelinquenciesOutputValidator(BaseValidator):
1654
"""Validator for delinquencies service output."""
1755

18-
schema = None
56+
schema = output_schema
1957

2058
def _custom_validation(self, gdf: gpd.GeoDataFrame):
2159
pass

data/src/validation/dev_probability.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import geopandas as gpd
2+
from pandera.pandas import Check, Column, DataFrameSchema
23

3-
from .base import BaseValidator
4+
from .base import BaseValidator, DistributionParams, distribution_check
45

56

67
class DevProbabilityInputValidator(BaseValidator):
@@ -12,10 +13,22 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
1213
pass
1314

1415

16+
permit_counts_params = DistributionParams(
17+
mean=42.129, std=44.789, max_value=413.000, q1=18.000, q3=46.000
18+
)
19+
20+
output_schema = DataFrameSchema(
21+
{
22+
"permit_count": Column(int, checks=[*distribution_check(permit_counts_params)]),
23+
"dev_rank": Column(str, checks=Check.isin(["Low", "Medium", "High"])),
24+
}
25+
)
26+
27+
1528
class DevProbabilityOutputValidator(BaseValidator):
1629
"""Validator for dev probability service output."""
1730

18-
schema = None
31+
schema = output_schema
1932

2033
def _custom_validation(self, gdf: gpd.GeoDataFrame):
2134
pass

0 commit comments

Comments
 (0)