Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion data/src/data_utils/city_owned_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import geopandas as gpd

from src.validation.base import ValidationResult, validate_output
from src.validation.city_owned_properties import CityOwnedPropertiesOutputValidator
from src.validation.city_owned_properties import (
CityOwnedPropertiesOutputValidator,
CityOwnedPropertiesInputValidator,
)

from ..classes.loaders import EsriLoader
from ..constants.services import CITY_OWNED_PROPERTIES_TO_LOAD
Expand Down Expand Up @@ -47,6 +50,7 @@ def city_owned_properties(
esri_urls=CITY_OWNED_PROPERTIES_TO_LOAD,
cols=["OPABRT", "AGENCY", "SIDEYARDELIGIBLE"],
opa_col="opabrt",
validator=CityOwnedPropertiesInputValidator(),
)

city_owned_properties, input_validation = loader.load_or_fetch()
Expand Down
10 changes: 8 additions & 2 deletions data/src/data_utils/council_dists.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import pandas as pd

from src.validation.base import ValidationResult, validate_output
from src.validation.council_dists import CouncilDistrictsOutputValidator
from src.validation.council_dists import (
CouncilDistrictsOutputValidator,
CouncilDistrictsInputValidator,
)

from ..classes.loaders import EsriLoader
from ..constants.services import COUNCIL_DISTRICTS_TO_LOAD
Expand Down Expand Up @@ -39,7 +42,10 @@ def council_dists(
"""

loader = EsriLoader(
name="Council Districts", esri_urls=COUNCIL_DISTRICTS_TO_LOAD, cols=["district"]
name="Council Districts",
esri_urls=COUNCIL_DISTRICTS_TO_LOAD,
cols=["district"],
validator=CouncilDistrictsInputValidator(),
)

council_dists, input_validation = loader.load_or_fetch()
Expand Down
2 changes: 1 addition & 1 deletion data/src/validation/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __bool__(self):
class BaseValidator(ABC):
"""Base class for service-specific data validation."""

schema: pa.DataFrameSchema = None
schema = None

def __init_subclass__(cls):
schema = getattr(cls, "schema", None)
Expand Down
63 changes: 58 additions & 5 deletions data/src/validation/city_owned_properties.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,74 @@
import geopandas as gpd

import pandera.pandas as pa
import pandas as pd
from .base import BaseValidator

# Expecting ~7,796 records returned (within ±20% tolerance).
# The bounds below are enforced by the dataframe-level check in
# CityOwnedPropertiesInputSchema.
expected = 7796
lower = int(expected * 0.8)
upper = int(expected * 1.2)

# Schema for the raw city-owned properties data as it comes out of the loader.
CityOwnedPropertiesInputSchema = pa.DataFrameSchema(
    columns={
        # NOTE(review): the column is declared as integer, so comparing its
        # values with "" cannot fail -- confirm whether opa_id is meant to be
        # validated as a string here.
        "opa_id": pa.Column(
            pa.Int,
            checks=pa.Check(
                lambda s: s.dropna() != "",
                error="opa_id must not be an empty string",
            ),
        ),
        "agency": pa.Column(pa.String, nullable=True),
        "sideyardeligible": pa.Column(
            pa.Category, nullable=True, checks=pa.Check.isin(["Yes", "No"])
        ),
        "geometry": pa.Column("geometry"),
    },
    # Guard against a truncated or unexpectedly large response from the service.
    checks=pa.Check(
        lambda df: lower <= df.shape[0] <= upper,
        error=(
            f"row count must be between {lower} and {upper} "
            f"(expected ~{expected})"
        ),
    ),
    strict=True,
)

# Schema assigned to CityOwnedPropertiesOutputValidator.schema.
# It lists the columns expected on the dataset after the city-owned
# properties step, including the two columns this step contributes
# (city_owner_agency, side_yard_eligible).
# NOTE(review): strict=True presumably rejects any column not listed
# here -- confirm this matches how the accumulated dataset evolves.
CityOwnedPropertiesOutputSchema = pa.DataFrameSchema(
columns={
# NOTE(review): opa_id is declared as integer but compared with "";
# that comparison cannot fail for integer values -- confirm intended dtype.
"opa_id": pa.Column(pa.Int, checks=pa.Check(lambda s: s.dropna() != "")),
"market_value": pa.Column(pa.Int, nullable=True),
# Sale dates are required to be timezone-aware (UTC).
"sale_date": pa.Column(pd.DatetimeTZDtype(tz="UTC"), nullable=True),
"sale_price": pa.Column(pa.Float, nullable=True),
"owner_1": pa.Column(pa.String, nullable=True),
"owner_2": pa.Column(pa.String, nullable=True),
"building_code_description": pa.Column(pa.String, nullable=True),
"zip_code": pa.Column(pa.String, nullable=True),
"zoning": pa.Column(pa.String, nullable=True),
"parcel_type": pa.Column(pa.String, nullable=True),
"standardized_address": pa.Column(pa.String, nullable=True),
"vacant": pa.Column(pa.Bool, nullable=True),
"district": pa.Column(pa.String, nullable=True),
"neighborhood": pa.Column(pa.String, nullable=True),
"rco_info": pa.Column(pa.String, nullable=True),
"rco_names": pa.Column(pa.String, nullable=True),
"city_owner_agency": pa.Column(pa.String, nullable=True),
# Categorical flag; only "Yes" / "No" (or null) are accepted.
"side_yard_eligible": pa.Column(
pa.Category, nullable=True, checks=pa.Check.isin(["Yes", "No"])
),
"geometry": pa.Column("geometry"),
},
strict=True,
)


class CityOwnedPropertiesInputValidator(BaseValidator):
    """
    Validator for the city-owned properties dataset input.

    schema and _custom_validation() are used by validate() in the parent class.
    """

    # Pandera schema applied to the freshly loaded data.
    schema = CityOwnedPropertiesInputSchema

    def _custom_validation(self, gdf: gpd.GeoDataFrame):
        """Run dataset-specific checks on ``gdf``; currently there are none."""
        pass


class CityOwnedPropertiesOutputValidator(BaseValidator):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these should go on the CityOwnedPropertiesInputValidator, since this defines a schema for the incoming data. The ...OutputValidators are used to monitor the accumulated, constructed dataset in the pipeline after each service is called, so they should have an evolving schema that depends on the particular columns being changed or added progressively by the services.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Where exactly should CityOwnedPropertiesInputValidator be called? I see in your framework setup you installed decorators with OutputValidators for each data_util, but InputValidators aren't being called.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the input validators need to be called slightly differently since they come in through the data loader. Each input validator should be passed into the respective loader class for its data service, as seen in the class definition here. The validation is then run whenever data loading occurs here, and the result is passed out.

"""Validator for city owned properties service output."""
"""
Validator for the city-owned properties dataset output.
schema and _custom_validation() are used by validate() in the parent class.
"""

schema = None
schema = CityOwnedPropertiesOutputSchema

def _custom_validation(self, gdf: gpd.GeoDataFrame):
pass
45 changes: 42 additions & 3 deletions data/src/validation/council_dists.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,51 @@
import geopandas as gpd

import pandera.pandas as pa
import pandas as pd
from .base import BaseValidator

# Schema for the raw council districts data as it comes out of the loader.
CouncilDistrictsInputSchema = pa.DataFrameSchema(
    columns={
        "district": pa.Column(
            str,
            nullable=True,
        ),
        "geometry": pa.Column("geometry"),
    },
    # The non-null district labels must be exactly the strings "1" through "10".
    # Only the set of distinct labels is compared, so this check does not by
    # itself enforce the number of rows.
    checks=pa.Check(
        lambda df: set(df["district"].dropna().unique())
        == {str(i) for i in range(1, 11)},
        error='district values must be exactly the strings "1" through "10"',
    ),
    strict=True,
)

# Schema assigned to CouncilDistrictsOutputValidator.schema.
# It lists the columns expected on the dataset after the council
# districts step, including the "district" column.
# NOTE(review): strict=True presumably rejects any column not listed
# here -- confirm this matches the dataset at this point in the pipeline.
CouncilDistrictsOutputSchema = pa.DataFrameSchema(
columns={
"opa_id": pa.Column(pa.String),
"street_address": pa.Column(pa.String, nullable=True),
"market_value": pa.Column(pa.Int, nullable=True),
# Sale dates are required to be timezone-aware (UTC).
"sale_date": pa.Column(pd.DatetimeTZDtype(tz="UTC"), nullable=True),
"sale_price": pa.Column(pa.Float, nullable=True),
"owner_1": pa.Column(pa.String, nullable=True),
"owner_2": pa.Column(pa.String, nullable=True),
"building_code_description": pa.Column(pa.String, nullable=True),
"zip_code": pa.Column(pa.String, nullable=True),
"zoning": pa.Column(pa.String, nullable=True),
"parcel_type": pa.Column(pa.String, nullable=True),
"vacant": pa.Column(pa.Bool, nullable=True),
# Each non-null district must be one of the strings "1" through "10".
"district": pa.Column(
str, nullable=True, checks=pa.Check.isin([str(i) for i in range(1, 11)])
),
"geometry": pa.Column("geometry"),
},
strict=True,
)


class CouncilDistrictsInputValidator(BaseValidator):
    """Validator for council districts service input."""

    # Pandera schema applied to the freshly loaded council districts data.
    schema = CouncilDistrictsInputSchema

    def _custom_validation(self, gdf: gpd.GeoDataFrame):
        """Run dataset-specific checks on ``gdf``; currently there are none."""
        pass
Expand All @@ -15,7 +54,7 @@ def _custom_validation(self, gdf: gpd.GeoDataFrame):
class CouncilDistrictsOutputValidator(BaseValidator):
    """Validator for council districts service output."""

    # Pandera schema applied to the dataset after the council districts step.
    schema = CouncilDistrictsOutputSchema

    def _custom_validation(self, gdf: gpd.GeoDataFrame):
        """Run dataset-specific checks on ``gdf``; currently there are none."""
        pass
Loading