data/src/data_utils/community_gardens.py (1 addition, 0 deletions)
@@ -47,6 +47,7 @@ def community_gardens(
         name="Community Gardens",
         esri_urls=COMMUNITY_GARDENS_TO_LOAD,
         cols=["site_name"],
+        validator=CommunityGardensInputValidator(),
     )
 
     community_gardens, input_validation = loader.load_or_fetch()

data/src/data_utils/imm_dang_buildings.py (5 additions, 1 deletion)
@@ -4,7 +4,10 @@
 import geopandas as gpd
 
 from src.validation.base import ValidationResult, validate_output
-from src.validation.imm_dang_buildings import ImmDangerOutputValidator
+from src.validation.imm_dang_buildings import (
+    ImmDangerInputValidator,
+    ImmDangerOutputValidator,
+)
 
 from ..classes.loaders import CartoLoader
 from ..constants.services import IMMINENT_DANGER_BUILDINGS_QUERY
@@ -45,6 +48,7 @@ def imm_dang_buildings(
         name="Imminently Dangerous Buildings",
         carto_queries=IMMINENT_DANGER_BUILDINGS_QUERY,
         opa_col="opa_account_num",
+        validator=ImmDangerInputValidator(),
     )
 
     imm_dang_buildings, input_validation = loader.load_or_fetch()

data/src/data_utils/li_violations.py (5 additions, 1 deletion)
@@ -4,7 +4,10 @@
 import pandas as pd
 
 from src.validation.base import ValidationResult, validate_output
-from src.validation.li_violations import LIViolationsOutputValidator
+from src.validation.li_violations import (
+    LIViolationsOutputValidator,
+    LIViolationsInputValidator,
+)
 
 from ..classes.loaders import CartoLoader
 from ..constants.services import VIOLATIONS_SQL_QUERY
@@ -59,6 +62,7 @@ def li_violations(
         name="LI Violations",
         carto_queries=VIOLATIONS_SQL_QUERY,
         opa_col="opa_account_num",
+        validator=LIViolationsInputValidator(),
     )
 
     l_and_i_violations, input_validation = loader.load_or_fetch()

data/src/data_utils/phs_properties.py (8 additions, 2 deletions)
@@ -3,7 +3,10 @@
 import geopandas as gpd
 
 from src.validation.base import ValidationResult, validate_output
-from src.validation.phs_properties import PHSPropertiesOutputValidator
+from src.validation.phs_properties import (
+    PHSPropertiesInputValidator,
+    PHSPropertiesOutputValidator,
+)
 
 from ..classes.loaders import EsriLoader
 from ..constants.services import PHS_LAYERS_TO_LOAD
@@ -42,7 +45,10 @@ def phs_properties(
     print(input_gdf.head())
 
     loader = EsriLoader(
-        name="PHS Properties", esri_urls=PHS_LAYERS_TO_LOAD, cols=["program"]
+        name="PHS Properties",
+        esri_urls=PHS_LAYERS_TO_LOAD,
+        cols=["program"],
+        validator=PHSPropertiesInputValidator(),
     )
 
     phs_properties, input_validation = loader.load_or_fetch()

data/src/data_utils/ppr_properties.py (5 additions, 1 deletion)
@@ -5,7 +5,10 @@
 import requests
 
 from src.validation.base import ValidationResult, validate_output
-from src.validation.ppr_properties import PPRPropertiesOutputValidator
+from src.validation.ppr_properties import (
+    PPRPropertiesInputValidator,
+    PPRPropertiesOutputValidator,
+)
 
 from ..classes.loaders import EsriLoader, GdfLoader
 from ..constants.services import PPR_PROPERTIES_TO_LOAD
@@ -73,6 +76,7 @@ def ppr_properties(
         input=io.BytesIO(response.content),
         name="PPR Properties",
         cols=["public_name"],
+        validator=PPRPropertiesInputValidator(),
     )
     ppr_properties, input_validation = loader.load_or_fetch()
 

data/src/data_utils/pwd_parcels.py (5 additions, 1 deletion)
@@ -3,7 +3,10 @@
 import geopandas as gpd
 
 from src.validation.base import ValidationResult, validate_output
-from src.validation.pwd_parcels import PWDParcelsOutputValidator
+from src.validation.pwd_parcels import (
+    PWDParcelsInputValidator,
+    PWDParcelsOutputValidator,
+)
 
 from ..classes.loaders import CartoLoader
 from ..constants.services import PWD_PARCELS_QUERY
@@ -122,6 +125,7 @@ def pwd_parcels(
         name="PWD Parcels",
         carto_queries=PWD_PARCELS_QUERY,
         opa_col="brt_id",
+        validator=PWDParcelsInputValidator(),
     )
 
     pwd_parcels, input_validation = loader.load_or_fetch()

data/src/data_utils/rco_geoms.py (4 additions, 2 deletions)
@@ -5,7 +5,7 @@
 import pandas as pd
 
 from src.validation.base import ValidationResult, validate_output
-from src.validation.rco_geoms import RCOGeomsOutputValidator
+from src.validation.rco_geoms import RCOGeomsOutputValidator, RCOGeomsInputValidator
 
 from ..classes.loaders import EsriLoader
 from ..constants.services import RCOS_LAYERS_TO_LOAD
@@ -45,7 +45,9 @@ def rco_geoms(input_gdf: gpd.GeoDataFrame) -> Tuple[gpd.GeoDataFrame, Validation
     Primary Feature Layer Columns Referenced:
         opa_id, geometry
     """
-    loader = EsriLoader(name="RCOs", esri_urls=RCOS_LAYERS_TO_LOAD)
+    loader = EsriLoader(
+        name="RCOs", esri_urls=RCOS_LAYERS_TO_LOAD, validator=RCOGeomsInputValidator()
+    )
     rco_geoms, input_validation = loader.load_or_fetch()
 
     logger.debug(f"RCO data loaded: {len(rco_geoms)} RCO records")

data/src/data_utils/unsafe_buildings.py (5 additions, 1 deletion)
@@ -4,7 +4,10 @@
 import geopandas as gpd
 
 from src.validation.base import ValidationResult, validate_output
-from src.validation.unsafe_buildings import UnsafeBuildingsOutputValidator
+from src.validation.unsafe_buildings import (
+    UnsafeBuildingsOutputValidator,
+    UnsafeBuildingsInputValidator,
+)
 
 from ..classes.loaders import CartoLoader
 from ..constants.services import UNSAFE_BUILDINGS_QUERY
@@ -44,6 +47,7 @@ def unsafe_buildings(
         name="Unsafe Buildings",
         carto_queries=UNSAFE_BUILDINGS_QUERY,
         opa_col="opa_account_num",
+        validator=UnsafeBuildingsInputValidator(),
     )
 
     unsafe_buildings, input_validation = loader.load_or_fetch()

data/src/data_utils/vacant_properties.py (5 additions, 1 deletion)
@@ -7,7 +7,10 @@
 
 from src.config.config import ROOT_DIRECTORY
 from src.validation.base import ValidationResult, validate_output
-from src.validation.vacant_properties import VacantPropertiesOutputValidator
+from src.validation.vacant_properties import (
+    VacantPropertiesInputValidator,
+    VacantPropertiesOutputValidator,
+)
 
 from ..classes.loaders import EsriLoader, google_cloud_bucket
 from ..constants.services import VACANT_PROPS_LAYERS_TO_LOAD
@@ -126,6 +129,7 @@ def vacant_properties(
         name="Vacant Properties",
         esri_urls=VACANT_PROPS_LAYERS_TO_LOAD,
         cols=["opa_id", "parcel_type"],
+        validator=VacantPropertiesInputValidator(),
     )
 
     vacant_properties, input_validation = loader.load_or_fetch()

data/src/validation/base.py (20 additions, 0 deletions)
@@ -752,6 +752,26 @@ def null_percentage_check(null_percent: float) -> Check:
     )
 
 
+def row_count_check(reference_count: int, tolerance: float = 0.1) -> Check:
+    """
+    Create a check that validates if the DataFrame's row count is within a specified tolerance range.
+
+    Args:
+        reference_count: The expected number of rows
+        tolerance: The allowed deviation as a fraction of the reference count (default 0.1, i.e. 10%)
+
+    Returns:
+        Check: A pandera Check object that validates row count
+    """
+    lower_bound = reference_count * (1 - tolerance)
+    upper_bound = reference_count * (1 + tolerance)
+
+    return Check(
+        lambda df: df.shape[0] >= lower_bound and df.shape[0] <= upper_bound,
+        error=f"DataFrame size must be between {int(lower_bound)} and {int(upper_bound)} rows (±{tolerance * 100}% from {reference_count}).",
+    )
+
+
 @dataclass
 class DistributionParams:
     min_value: Optional[int | float] = None

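For illustration, here is a minimal sketch (not part of the diff) of how the new row_count_check helper composes with a pandera schema. The column name and the 200-row reference count are invented, and note that tolerance is a fraction of the reference count, so the default 0.1 means ±10%.

# Illustrative only: hypothetical schema exercising row_count_check; the column
# name and reference count do not correspond to a real dataset in the repo.
import pandas as pd
import pandera.pandas as pa
from pandera.errors import SchemaError

from src.validation.base import row_count_check  # helper added in this diff

ExampleInputSchema = pa.DataFrameSchema(
    columns={"opa_id": pa.Column(str)},
    # Accept between 180 and 220 rows (±10% of the 200-row reference count).
    checks=row_count_check(200, tolerance=0.1),
    strict=False,
)

ok = pd.DataFrame({"opa_id": [str(i) for i in range(205)]})
ExampleInputSchema.validate(ok)  # passes: 205 rows is within tolerance

too_small = pd.DataFrame({"opa_id": ["1", "2", "3"]})
try:
    ExampleInputSchema.validate(too_small)
except SchemaError as err:
    print(err)  # reports the row-count failure with the bounds in its message

The input schemas below use the same shape, with strict=False so extra columns in the source data do not fail validation.
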
data/src/validation/city_owned_properties.py (7 additions, 13 deletions)
@@ -1,7 +1,7 @@
 import geopandas as gpd
 import pandera.pandas as pa
 
-from .base import BaseValidator
+from .base import BaseValidator, row_count_check
 
 # Define the City Owned Properties DataFrame Schema
 CityOwnedPropertiesSchema = pa.DataFrameSchema(
@@ -29,11 +29,8 @@
     coerce=True,
 )
 
-# Expecting ~7,796 records returned (within ±20% tolerance).
-# This is checked in CityOwnedPropertiesInputSchema
-expected = 7796
-lower = int(expected * 0.8)
-upper = int(expected * 1.2)
+# Reference count for city owned properties
+CITY_OWNED_REFERENCE_COUNT = 7796
 
 CityOwnedPropertiesInputSchema = pa.DataFrameSchema(
     columns={
@@ -44,20 +41,17 @@
         ),
         "geometry": pa.Column("geometry"),
     },
-    checks=pa.Check(lambda df: lower <= df.shape[0] <= upper),
-    strict=True,
+    checks=row_count_check(CITY_OWNED_REFERENCE_COUNT, tolerance=0.1),
+    strict=False,
 )
 
 
 class CityOwnedPropertiesInputValidator(BaseValidator):
-    """
-    Validator for the city-owned properties dataset input.
-    schema and _custom_validation() are used by validate() in the parent class.
-    """
+    """Validator for city owned properties service input."""
 
     schema = CityOwnedPropertiesInputSchema
 
-    def _custom_validation(self, gdf: gpd.GeoDataFrame):
+    def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
         pass
 
 
data/src/validation/community_gardens.py (14 additions, 4 deletions)
@@ -1,9 +1,7 @@
 import geopandas as gpd
 import pandera.pandas as pa
 
-from .base import (
-    BaseValidator,
-)
+from .base import BaseValidator, row_count_check
 
 # Define the Community Gardens DataFrame Schema
 CommunityGardensSchema = pa.DataFrameSchema(
@@ -23,11 +21,23 @@
     coerce=True,
 )
 
+# Reference count for community gardens
+COMMUNITY_GARDENS_REFERENCE_COUNT = 205
+
+CommunityGardensInputSchema = pa.DataFrameSchema(
+    columns={
+        "geometry": pa.Column("geometry"),
+        "site_name": pa.Column(str, nullable=True),
+    },
+    checks=row_count_check(COMMUNITY_GARDENS_REFERENCE_COUNT, tolerance=0.1),
+    strict=False,
+)
+
 
 class CommunityGardensInputValidator(BaseValidator):
     """Validator for community gardens service input."""
 
-    schema = None  # No schema validation for input
+    schema = CommunityGardensInputSchema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
         pass

data/src/validation/contig_neighbors.py (1 addition, 1 deletion)
@@ -9,7 +9,7 @@ class ContigNeighborsInputValidator(BaseValidator):
 
     schema = None
 
-    def _custom_validation(self, gdf: gpd.GeoDataFrame):
+    def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
         pass
 
 
data/src/validation/council_dists.py (1 addition, 1 deletion)
@@ -39,7 +39,7 @@
         lambda df: set(df["district"].dropna().unique())
         == {str(i) for i in range(1, 11)}
     ),
-    strict=True,
+    strict=False,
 )
 
 
data/src/validation/delinquencies.py (1 addition, 1 deletion)
@@ -11,7 +11,7 @@ class DelinquenciesInputValidator(BaseValidator):
 
     schema = None
 
-    def _custom_validation(self, gdf: gpd.GeoDataFrame):
+    def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
         pass
 
 
data/src/validation/dev_probability.py (2 additions, 2 deletions)
@@ -7,11 +7,11 @@
 
 
 class DevProbabilityInputValidator(BaseValidator):
-    """Validator for dev probability service input from census block groups."""
+    """Validator for development probability service input."""
 
     schema = None
 
-    def _custom_validation(self, gdf: gpd.GeoDataFrame):
+    def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
         pass
 
 
data/src/validation/dor_parcels.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@ class DorParcelsInputValidator(BaseValidator):
 
     schema = None
 
-    def _custom_validation(self, gdf: gpd.GeoDataFrame):
+    def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
         pass
 
 
data/src/validation/imm_dang_buildings.py (14 additions, 2 deletions)
@@ -2,7 +2,7 @@
 import pandera.pandas as pa
 from pandera import Check
 
-from .base import BaseValidator
+from .base import BaseValidator, row_count_check
 
 # Define the Imminently Dangerous Buildings DataFrame Schema
 ImmDangerBuildingsSchema = pa.DataFrameSchema(
@@ -31,11 +31,23 @@
     strict=False,
 )
 
+# Reference count for imminently dangerous buildings
+IMM_DANGER_BUILDINGS_REFERENCE_COUNT = 186
+
+ImmDangerBuildingsInputSchema = pa.DataFrameSchema(
+    columns={
+        "opa_id": pa.Column(pa.String, checks=pa.Check(lambda s: s.dropna() != "")),
+        "geometry": pa.Column("geometry"),
+    },
+    checks=row_count_check(IMM_DANGER_BUILDINGS_REFERENCE_COUNT, tolerance=0.1),
+    strict=False,
+)
+
 
 class ImmDangerInputValidator(BaseValidator):
     """Validator for imminent danger buildings service input."""
 
-    schema = None
+    schema = ImmDangerBuildingsInputSchema
 
     def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
        pass

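Taken together, wiring an input validator for a further dataset would follow the same pattern as the files above. A hypothetical sketch (the "Demolitions" name and the 1200-row reference count are invented, not part of the diff):

# Hypothetical new module following the pattern above, e.g. data/src/validation/demolitions.py.
import geopandas as gpd
import pandera.pandas as pa

from .base import BaseValidator, row_count_check

# Reference count for the (hypothetical) demolitions dataset
DEMOLITIONS_REFERENCE_COUNT = 1200

DemolitionsInputSchema = pa.DataFrameSchema(
    columns={
        "opa_id": pa.Column(pa.String),
        "geometry": pa.Column("geometry"),
    },
    checks=row_count_check(DEMOLITIONS_REFERENCE_COUNT, tolerance=0.1),
    strict=False,
)


class DemolitionsInputValidator(BaseValidator):
    """Validator for the hypothetical demolitions service input."""

    schema = DemolitionsInputSchema

    def _custom_validation(self, gdf: gpd.GeoDataFrame, check_stats: bool = True):
        pass

The corresponding data_utils module would then pass validator=DemolitionsInputValidator() to its CartoLoader or EsriLoader, exactly as the earlier hunks do.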