diff --git a/data/src/config/config.py b/data/src/config/config.py index 61abf9e0..0c843306 100644 --- a/data/src/config/config.py +++ b/data/src/config/config.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from pathlib import Path -FORCE_RELOAD = True +FORCE_RELOAD = False """ During the data load, whether to query the various GIS API services for the data to load. If True, will query the API services and report on data differences. If false will read the cached data.""" diff --git a/data/src/data_utils/access_process.py b/data/src/data_utils/access_process.py index cebaec0f..e86633a2 100644 --- a/data/src/data_utils/access_process.py +++ b/data/src/data_utils/access_process.py @@ -3,11 +3,13 @@ import geopandas as gpd import pandas as pd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.access_process import AccessProcessOutputValidator from src.validation.base import ValidationResult, validate_output @validate_output(AccessProcessOutputValidator) +@provide_metadata(current_metadata=current_metadata) def access_process( dataset: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: @@ -30,7 +32,7 @@ def access_process( access_process (str): The access process for each property based on city ownership and market value. Will be NA for non-vacant properties. 
- Primary Feature Layer Columns Referenced: + Columns referenced: city_owner_agency, market_value, vacant Side Effects: diff --git a/data/src/data_utils/city_owned_properties.py b/data/src/data_utils/city_owned_properties.py index 666dcbf9..c671fe67 100644 --- a/data/src/data_utils/city_owned_properties.py +++ b/data/src/data_utils/city_owned_properties.py @@ -3,10 +3,11 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.city_owned_properties import ( - CityOwnedPropertiesOutputValidator, CityOwnedPropertiesInputValidator, + CityOwnedPropertiesOutputValidator, ) from ..classes.loaders import EsriLoader @@ -17,27 +18,28 @@ @validate_output(CityOwnedPropertiesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def city_owned_properties( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Processes city-owned property data by joining it with the primary feature layer, + Processes city-owned property data by joining it with the input dataframe, renaming columns, and updating access information for properties based on ownership. All instances where the "city_owner_agency" is "PLB" are changed to "Land Bank (PHDC)". Args: - primary_featurelayer (FeatureLayer): The primary feature layer to which city-owned + input_gdf (GeoDataFrame): The GeoDataFrame to which city-owned property data will be joined. Returns: - FeatureLayer: The updated primary feature layer with processed city ownership + GeoDataFrame: The updated GeoDataFrame with processed city ownership information. Columns added: city_owner_agency (str): The agency that owns the city property. side_yard_eligible (bool): Indicates if the property is eligible for the side yard program. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, owner_1, owner2 Tagline: diff --git a/data/src/data_utils/community_gardens.py b/data/src/data_utils/community_gardens.py index 61ad1b30..ecf276d4 100644 --- a/data/src/data_utils/community_gardens.py +++ b/data/src/data_utils/community_gardens.py @@ -3,6 +3,7 @@ import geopandas as gpd from src.constants.city_limits import PHL_GEOMETRY +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.community_gardens import ( CommunityGardensInputValidator, @@ -15,19 +16,20 @@ @validate_output(CommunityGardensOutputValidator) +@provide_metadata(current_metadata=current_metadata) def community_gardens( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Updates the 'vacant' column in the primary feature layer to ensure community gardens + Updates the 'vacant' column in the input dataframe to ensure community gardens are marked as not vacant. This protects known community gardens from being categorized as vacant, preventing potential predatory development. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The input GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with the 'vacant' column updated to False + GeoDataFrame: The input GeoDataFrame with the 'vacant' column updated to False for parcels containing community gardens. Tagline: @@ -36,7 +38,7 @@ def community_gardens( Columns updated: vacant: Updated to False for parcels containing community gardens. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, vacant Source: diff --git a/data/src/data_utils/conservatorship.py b/data/src/data_utils/conservatorship.py index 947112af..ef70fad4 100644 --- a/data/src/data_utils/conservatorship.py +++ b/data/src/data_utils/conservatorship.py @@ -5,6 +5,7 @@ import pytz from dateutil.parser import parse +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.conservatorship import ConservatorshipOutputValidator @@ -15,26 +16,27 @@ @validate_output(ConservatorshipOutputValidator) +@provide_metadata(current_metadata=current_metadata) def conservatorship( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Determines conservatorship eligibility for properties in a feature layer. + Determines conservatorship eligibility for properties in a GeoDataFrame. Args: - primary_featurelayer (FeatureLayer): A feature layer containing property data in a GeoDataFrame (`gdf`). + input_gdf (GeoDataFrame): A GeoDataFrame containing property data in a GeoDataFrame (`gdf`). Columns Added: conservatorship (bool): Indicates whether each property qualifies for conservatorship (True or False). - Primary Feature Layer Columns Referenced: + Columns referenced: city_owner_agency, sheriff_sale, market_value, all_violations_past_year, sale_date Tagline: Identify conservatorship-eligible properties Returns: - FeatureLayer: The input feature layer with an added "conservatorship" column indicating + GeoDataFrame: The input GeoDataFrame with an added "conservatorship" column indicating whether each property qualifies for conservatorship (True or False). 
""" conservatorships = [] diff --git a/data/src/data_utils/contig_neighbors.py b/data/src/data_utils/contig_neighbors.py index 9ae6b268..c08ee0a7 100644 --- a/data/src/data_utils/contig_neighbors.py +++ b/data/src/data_utils/contig_neighbors.py @@ -6,6 +6,7 @@ import numpy as np from libpysal.weights import Queen +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.contig_neighbors import ContigNeighborsOutputValidator @@ -13,17 +14,18 @@ @validate_output(ContigNeighborsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def contig_neighbors( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Calculates the number of contiguous vacant neighbors for each property in a feature layer. + Calculates the number of contiguous vacant neighbors for each property in a GeoDataFrame. Args: - primary_featurelayer (FeatureLayer): A feature layer containing property data in a GeoDataFrame (`gdf`). + input_gdf: A input GeoDataFrame containing property data in a GeoDataFrame (`gdf`). Returns: - FeatureLayer: The input feature layer with an added "n_contiguous" column indicating + GeoDataFrame: The input GeoDataFrame with an added "n_contiguous" column indicating the number of contiguous vacant neighbors for each property. Tagline: @@ -32,7 +34,7 @@ def contig_neighbors( Columns Added: n_contiguous (int): The number of contiguous vacant neighbors for each property. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, vacant """ print(f"[DEBUG] contig_neighbors: Starting with {len(input_gdf)} properties") @@ -192,7 +194,7 @@ def contig_neighbors( f"[DEBUG] contig_neighbors: vacant_parcels opa_ids in input_gdf: {len(matching_opa_ids)} / {len(vacant_opa_ids)}" ) - # Merge the results back to the primary feature layer + # Merge the results back to the input GeoDataFrame input_gdf = opa_join(input_gdf, vacant_parcels[["opa_id", "n_contiguous"]]) # Debug: Check what's in input_gdf after join diff --git a/data/src/data_utils/council_dists.py b/data/src/data_utils/council_dists.py index 80dad4d8..4b28dc0d 100644 --- a/data/src/data_utils/council_dists.py +++ b/data/src/data_utils/council_dists.py @@ -3,6 +3,7 @@ import geopandas as gpd import pandas as pd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.council_dists import ( CouncilDistrictsInputValidator, @@ -17,18 +18,19 @@ @validate_output(CouncilDistrictsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def council_dists( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Associates properties in the primary feature layer with council districts + Associates properties in the input GeoDataFrame with council districts using a spatial join. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with properties spatially joined + GeoDataFrame: The input GeoDataFrame with properties spatially joined to council districts, ensuring no duplicate entries. Tagline: @@ -37,7 +39,7 @@ def council_dists( Columns added: district (str): The council district associated with the property. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry """ @@ -74,7 +76,7 @@ def council_dists( merged_gdf = spatial_join(input_gdf, council_dists, predicate="within") - # Drop duplicates in the primary feature layer + # Drop duplicates in the input GeoDataFrame merged_gdf.drop_duplicates(inplace=True) # Debug: Check for duplicate OPA IDs and show what's causing them diff --git a/data/src/data_utils/delinquencies.py b/data/src/data_utils/delinquencies.py index 87c16cab..ce504301 100644 --- a/data/src/data_utils/delinquencies.py +++ b/data/src/data_utils/delinquencies.py @@ -3,6 +3,7 @@ import geopandas as gpd import pandas as pd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.delinquencies import DelinquenciesOutputValidator @@ -12,18 +13,19 @@ @validate_output(DelinquenciesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def delinquencies( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Adds property tax delinquency information to the primary feature layer by + Adds property tax delinquency information to the input GeoDataFrame by joining with a tax delinquencies dataset. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with added columns for tax delinquency + GeoDataFrame: The input GeoDataFrame with added columns for tax delinquency information, including total due, actionable status, payment agreements, and more. Tagline: @@ -41,7 +43,7 @@ def delinquencies( sheriff_sale (bool): Indicates if the property is at risk of sheriff sale. total_assessment (float): Total property assessment. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id """ diff --git a/data/src/data_utils/dev_probability.py b/data/src/data_utils/dev_probability.py index 2bc69764..cbab69cd 100644 --- a/data/src/data_utils/dev_probability.py +++ b/data/src/data_utils/dev_probability.py @@ -6,6 +6,7 @@ import requests from src.config.config import USE_CRS +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.dev_probability import DevProbabilityOutputValidator @@ -15,19 +16,20 @@ @validate_output(DevProbabilityOutputValidator) +@provide_metadata(current_metadata=current_metadata) def dev_probability( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ Calculates development probability based on permit counts and assigns development ranks to census block groups. The results are joined to the - primary feature layer. + input GeoDataFrame. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with added spatial join data for + GeoDataFrame: The input GeoDataFrame with added spatial join data for development probability and ranks. Tagline: @@ -37,7 +39,7 @@ def dev_probability( permit_count (int): The number of permits issued in the census block group. dev_rank (str): The development rank of the census block group. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry Source: diff --git a/data/src/data_utils/drug_crimes.py b/data/src/data_utils/drug_crimes.py index ffe1b2db..ba0b9396 100644 --- a/data/src/data_utils/drug_crimes.py +++ b/data/src/data_utils/drug_crimes.py @@ -3,6 +3,7 @@ import geopandas as gpd from src.data_utils.kde import apply_kde_to_input +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.drug_crimes import DrugCrimesOutputValidator @@ -10,17 +11,18 @@ @validate_output(DrugCrimesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def drug_crimes( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. + Applies kernel density estimation (KDE) analysis for drug crimes to the input GeoDataFrame. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with KDE analysis results for drug crimes. + GeoDataFrame: The input GeoDataFrame with KDE analysis results for drug crimes. Tagline: Density analysis for drug crimes @@ -31,7 +33,7 @@ def drug_crimes( drug_crimes_density_label (str): Categorized density level. drug_crimes_density_percentile (float): Percentile rank of density. 
- Primary Feature Layer Columns Referenced: + Columns referenced: geometry Source: diff --git a/data/src/data_utils/gun_crimes.py b/data/src/data_utils/gun_crimes.py index e41f5d2d..13841b15 100644 --- a/data/src/data_utils/gun_crimes.py +++ b/data/src/data_utils/gun_crimes.py @@ -3,6 +3,7 @@ import geopandas as gpd from src.data_utils.kde import apply_kde_to_input +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.gun_crimes import GunCrimesOutputValidator @@ -10,17 +11,18 @@ @validate_output(GunCrimesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def gun_crimes( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Applies kernel density estimation (KDE) analysis for gun crimes to the primary feature layer. + Applies kernel density estimation (KDE) analysis for gun crimes to the input GeoDataFrame. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with KDE analysis results for gun crimes. + GeoDataFrame: The input GeoDataFrame with KDE analysis results for gun crimes. Tagline: Analyzes gun crime density @@ -31,7 +33,7 @@ def gun_crimes( gun_crimes_density_label (str): Categorized density level. gun_crimes_density_percentile (float): Percentile rank of density. 
- Primary Feature Layer Columns Referenced: + Columns referenced: geometry Source: diff --git a/data/src/data_utils/imm_dang_buildings.py b/data/src/data_utils/imm_dang_buildings.py index 1de0cc78..fc4ffd43 100644 --- a/data/src/data_utils/imm_dang_buildings.py +++ b/data/src/data_utils/imm_dang_buildings.py @@ -3,6 +3,7 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.imm_dang_buildings import ImmDangerOutputValidator @@ -14,18 +15,19 @@ @validate_output(ImmDangerOutputValidator) +@provide_metadata(current_metadata=current_metadata) def imm_dang_buildings( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Adds information about imminently dangerous buildings to the primary feature layer + Adds information about imminently dangerous buildings to the input GeoDataFrame by joining with a dataset of dangerous buildings. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with an added "imm_dang_building" column, + GeoDataFrame: The input GeoDataFrame with an added "imm_dang_building" column, indicating whether each property is categorized as imminently dangerous ("Y" or "N"). Tagline: @@ -34,7 +36,7 @@ def imm_dang_buildings( Columns Added: imm_dang_building (bool): Indicates whether each property is categorized as imminently dangerous (True or False). 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id Source: @@ -110,7 +112,7 @@ def imm_dang_buildings( f"Deduplicated imminently dangerous buildings: {before_dedup} -> {after_dedup} records (removed {before_dedup - after_dedup} duplicates)" ) - # Join imminently dangerous buildings data with primary feature layer + # Join imminently dangerous buildings data with input GeoDataFrame merged_gdf = opa_join( input_gdf, imm_dang_buildings, diff --git a/data/src/data_utils/kde.py b/data/src/data_utils/kde.py index c35915b1..3c1ddf8d 100644 --- a/data/src/data_utils/kde.py +++ b/data/src/data_utils/kde.py @@ -138,7 +138,7 @@ def generic_kde( Returns: Tuple[str, np.ndarray]: The raster filename and the array of input points. """ - performance_logger.info(f"Initializing FeatureLayer for {name}") + performance_logger.info(f"Initializing GeoDataFrame for {name}") # Profile data loading with profile_section("Data Loading"): @@ -250,18 +250,18 @@ def apply_kde_to_input( batch_size: int = batch_size, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Applies KDE to the primary feature layer and adds columns for density, z-score, + Applies KDE to the input GeoDataFrame and adds columns for density, z-score, percentile, and percentile as a string. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. name (str): Name of the KDE feature. query (str): SQL query to fetch data for KDE. resolution (int): Resolution for the KDE raster grid. batch_size (int): Batch size for processing grid points. Returns: - FeatureLayer: The input feature layer with added KDE-related columns. + GeoDataFrame: The input GeoDataFrame with added KDE-related columns. 
""" raster_filename, crime_coords, input_validation = generic_kde( name, query, resolution, batch_size diff --git a/data/src/data_utils/li_complaints.py b/data/src/data_utils/li_complaints.py index 1d9244cd..54fa007c 100644 --- a/data/src/data_utils/li_complaints.py +++ b/data/src/data_utils/li_complaints.py @@ -2,6 +2,7 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.li_complaints import LIComplaintsOutputValidator @@ -10,17 +11,18 @@ @validate_output(LIComplaintsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def li_complaints( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Applies kernel density estimation (KDE) analysis for L&I complaints to the primary feature layer. + Applies kernel density estimation (KDE) analysis for L&I complaints to the input GeoDataFrame. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with KDE analysis results for L&I complaints, + GeoDataFrame: The input GeoDataFrame with KDE analysis results for L&I complaints, including density and derived metrics. Tagline: @@ -32,7 +34,7 @@ def li_complaints( l_and_i_complaints_density_label (str): Categorized density level. l_and_i_complaints_density_percentile (float): Percentile rank of density. 
- Primary Feature Layer Columns Referenced: + Columns referenced: geometry Source: diff --git a/data/src/data_utils/li_violations.py b/data/src/data_utils/li_violations.py index 2fc8b8de..6d664ba0 100644 --- a/data/src/data_utils/li_violations.py +++ b/data/src/data_utils/li_violations.py @@ -3,6 +3,7 @@ import geopandas as gpd import pandas as pd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.li_violations import LIViolationsOutputValidator @@ -12,6 +13,7 @@ @validate_output(LIViolationsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def li_violations( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: @@ -19,14 +21,14 @@ def li_violations( Process L&I (Licenses and Inspections) data for violations. This function filters and processes L&I violations data, - joining it with the primary feature layer based on spatial relationships + joining it with the input GeoDataFrame based on spatial relationships and OPA (Office of Property Assessment) identifiers. Args: - primary_featurelayer (FeatureLayer): The primary feature layer to join L&I data to. + input_gdf (GeoDataFrame): The input GeoDataFrame to join L&I data to. Returns: - FeatureLayer: The primary feature layer updated with L&I data. + GeoDataFrame: The input GeoDataFrame updated with L&I data. 
Tagline: Counts L&I violations @@ -38,7 +40,7 @@ def li_violations( Source: https://phl.carto.com/api/v2/sql - Primary Feature Layer Columns Referenced: + Columns referenced: opa_id """ keywords: List[str] = [ diff --git a/data/src/data_utils/nbhoods.py b/data/src/data_utils/nbhoods.py index 894f0ed1..be241738 100644 --- a/data/src/data_utils/nbhoods.py +++ b/data/src/data_utils/nbhoods.py @@ -2,6 +2,7 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.nbhoods import NeighborhoodsOutputValidator @@ -11,16 +12,17 @@ @validate_output(NeighborhoodsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def nbhoods(input_gdf: gpd.GeoDataFrame) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Adds neighborhood information to the primary feature layer by performing a spatial join + Adds neighborhood information to the input GeoDataFrame by performing a spatial join with a neighborhoods dataset. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with an added "neighborhood" column, + GeoDataFrame: The input GeoDataFrame with an added "neighborhood" column, containing the name of the neighborhood for each property. Tagline: @@ -29,7 +31,7 @@ def nbhoods(input_gdf: gpd.GeoDataFrame) -> Tuple[gpd.GeoDataFrame, ValidationRe Columns added: neighborhood (str): The name of the neighborhood associated with the property. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry Source: diff --git a/data/src/data_utils/negligent_devs.py b/data/src/data_utils/negligent_devs.py index 2c26d2f6..6a4910e0 100644 --- a/data/src/data_utils/negligent_devs.py +++ b/data/src/data_utils/negligent_devs.py @@ -2,21 +2,23 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.negligent_devs import NegligentDevsOutputValidator @validate_output(NegligentDevsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def negligent_devs( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ Identifies negligent developers based on the number of vacant properties owned and calculates the average number of L&I violations per distinct owner. - Flags negligent developers in the primary feature layer. + Flags negligent developers in the input GeoDataFrame. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Columns Added: negligent_dev (bool): non-city owned entities owning 5+ vacant properties @@ -26,14 +28,14 @@ def negligent_devs( per year for that developer (not limited to open violations or open properties) - Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, vacant, city_owner_agency, standardized_mailing_address, all_violations_past_year Tagline: Identify negligent developers Returns: - FeatureLayer: The input feature layer with additional columns for total properties + GeoDataFrame: The input GeoDataFrame with additional columns for total properties owned, vacant properties owned, average violations per property, and a "negligent_dev" flag. 
""" # Count total properties and vacant properties by standardized_mailing_address diff --git a/data/src/data_utils/opa_properties.py b/data/src/data_utils/opa_properties.py index e50e6a6f..3ba48c7b 100644 --- a/data/src/data_utils/opa_properties.py +++ b/data/src/data_utils/opa_properties.py @@ -7,6 +7,7 @@ from src.classes.loaders import CartoLoader from src.config.config import get_logger +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.opa_properties import OPAPropertiesOutputValidator @@ -139,8 +140,9 @@ def create_standardized_mailing_address_vectorized(gdf: gpd.GeoDataFrame) -> pd. @validate_output(OPAPropertiesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def opa_properties( - gdf: gpd.GeoDataFrame = None, + gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ Loads and processes OPA property data, standardizing addresses and cleaning geometries. diff --git a/data/src/data_utils/owner_type.py b/data/src/data_utils/owner_type.py index ba97597a..f73387f3 100644 --- a/data/src/data_utils/owner_type.py +++ b/data/src/data_utils/owner_type.py @@ -3,16 +3,18 @@ import geopandas as gpd import pandas as pd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.owner_type import OwnerTypeOutputValidator @validate_output(OwnerTypeOutputValidator) +@provide_metadata(current_metadata=current_metadata) def owner_type( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Determines the ownership type for each property in the primary feature layer based on + Determines the ownership type for each property in the input GeoDataFrame based on the 'owner_1', 'owner_2', 'city_owner_agency', and 'standardized_mailing_address' columns. 
The ownership type is set as: - "Public" if 'city_owner_agency' is not NA or if the mailing address matches specific @@ -24,10 +26,10 @@ def owner_type( - "Individual" if none of the above conditions are met. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property ownership data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property ownership data. Returns: - FeatureLayer: The updated feature layer with the 'owner_type' column added. + GeoDataFrame: The updated GeoDataFrame with the 'owner_type' column added. Tagline: Assigns ownership types @@ -36,7 +38,7 @@ def owner_type( owner_type (str): The ownership type of the property: Public, Nonprofit/Civic, Business (LLC), or Individual. - Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, owner_1, owner_2, city_owner_agency, standardized_mailing_address """ owner_types = [] diff --git a/data/src/data_utils/park_priority.py b/data/src/data_utils/park_priority.py index 76a45061..9c1d4e52 100644 --- a/data/src/data_utils/park_priority.py +++ b/data/src/data_utils/park_priority.py @@ -3,6 +3,7 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.park_priority import ParkPriorityOutputValidator @@ -116,6 +117,7 @@ def _park_priority_logic( @validate_output(ParkPriorityOutputValidator) +@provide_metadata(current_metadata=current_metadata) def park_priority( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: @@ -139,7 +141,7 @@ def park_priority( Columns Added: park_priority (float): The park priority score from TPL's analysis. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry Source: diff --git a/data/src/data_utils/phs_properties.py b/data/src/data_utils/phs_properties.py index f141484b..4facf88a 100644 --- a/data/src/data_utils/phs_properties.py +++ b/data/src/data_utils/phs_properties.py @@ -2,6 +2,7 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.phs_properties import PHSPropertiesOutputValidator @@ -11,19 +12,20 @@ @validate_output(PHSPropertiesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def phs_properties( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Perform a spatial join between the primary feature layer and the PHS properties layer, - then update the primary feature layer with a new column 'phs_care_program' indicating + Perform a spatial join between the input GeoDataFrame and the PHS properties layer, + then update the input GeoDataFrame with a new column 'phs_care_program' indicating if the property is part of the PHS care program. Args: - merged_gdf (FeatureLayer): The primary feature layer to join with the PHS properties layer. + merged_gdf (GeoDataFrame): The input GeoDataFrame to join with the PHS properties layer. Returns: - FeatureLayer: The updated primary feature layer with the 'phs_care_program' column. + GeoDataFrame: The updated input GeoDataFrame with the 'phs_care_program' column. Tagline: Identifies PHS Care properties @@ -31,7 +33,7 @@ def phs_properties( Columns added: phs_care_program (str): The PHS care program associated with the property. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry """ @@ -63,7 +65,7 @@ def phs_properties( ) print(f"PHS properties after deduplication: {len(phs_properties)} records") - # Perform spatial join between primary feature layer and PHS properties + # Perform spatial join between input GeoDataFrame and PHS properties merged_gdf = spatial_join(input_gdf, phs_properties) print(f"After spatial join: {len(merged_gdf)} records") diff --git a/data/src/data_utils/ppr_properties.py b/data/src/data_utils/ppr_properties.py index 27ee9529..5a4d4c85 100644 --- a/data/src/data_utils/ppr_properties.py +++ b/data/src/data_utils/ppr_properties.py @@ -4,6 +4,7 @@ import geopandas as gpd import requests +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.ppr_properties import PPRPropertiesOutputValidator @@ -13,19 +14,20 @@ @validate_output(PPRPropertiesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def ppr_properties( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Updates the 'vacant' column in the primary feature layer to ensure PPR properties + Updates the 'vacant' column in the input GeoDataFrame to ensure PPR properties are marked as not vacant. This prevents PPR properties from being miscategorized as vacant. Args: - primary_featurelayer (FeatureLayer): The primary feature layer to update. + input_gdf (GeoDataFrame): The input GeoDataFrame to update. Returns: - FeatureLayer: The updated primary feature layer. + GeoDataFrame: The updated input GeoDataFrame. Columns Updated: vacant: Updated to False for PPR properties. 
@@ -41,7 +43,7 @@ def ppr_properties( will fall back to loading the data from a GeoJSON URL https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson - Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry, vacant, public_name """ fallback_url = "https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson" @@ -76,7 +78,7 @@ def ppr_properties( ) ppr_properties, input_validation = loader.load_or_fetch() - # Perform a spatial join with the primary feature layer + # Perform a spatial join with the input GeoDataFrame merged_gdf = spatial_join(input_gdf, ppr_properties) # Remove duplicate OPA IDs in the main dataset after spatial join @@ -89,10 +91,10 @@ def ppr_properties( ) print(f"Main dataset after deduplication: {len(merged_gdf)} records") - # Ensure the 'vacant' column exists in the primary feature layer + # Ensure the 'vacant' column exists in the input GeoDataFrame if "vacant" not in merged_gdf.columns: raise ValueError( - "The 'vacant' column is missing in the primary feature layer. Ensure it exists before running this function." + "The 'vacant' column is missing in the input GeoDataFrame. Ensure it exists before running this function." 
) # Create a mask for rows where PPR properties are identified diff --git a/data/src/data_utils/priority_level.py b/data/src/data_utils/priority_level.py index 9852e85b..9f386a9b 100644 --- a/data/src/data_utils/priority_level.py +++ b/data/src/data_utils/priority_level.py @@ -3,11 +3,13 @@ import geopandas as gpd import pandas as pd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.priority_level import PriorityLevelOutputValidator @validate_output(PriorityLevelOutputValidator) +@provide_metadata(current_metadata=current_metadata) def priority_level( dataset: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: @@ -16,10 +18,10 @@ def priority_level( violations, tree canopy gaps, and PHS Landcare status. Args: - dataset (FeatureLayer): A feature layer containing property data. + dataset (GeoDataFrame): A GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with an added "priority_level" column, + GeoDataFrame: The input GeoDataFrame with an added "priority_level" column, indicating the priority for each property as "Low", "Medium", or "High". Columns Added: @@ -31,7 +33,7 @@ def priority_level( Source: gun_crimes_density_zscore, all_violations_past_year, l_and_i_complaints_density_zscore, - tree_canopy_gap, phs_care_program columns in the primary feature layer. + tree_canopy_gap, phs_care_program columns in the input GeoDataFrame. 
""" priority_levels = [] for idx, row in dataset.iterrows(): diff --git a/data/src/data_utils/pwd_parcels.py b/data/src/data_utils/pwd_parcels.py index 3b4940c9..603ae2fc 100644 --- a/data/src/data_utils/pwd_parcels.py +++ b/data/src/data_utils/pwd_parcels.py @@ -2,6 +2,7 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.pwd_parcels import PWDParcelsOutputValidator @@ -27,11 +28,11 @@ def merge_pwd_parcels_gdf( primary_gdf: gpd.GeoDataFrame, pwd_parcels_gdf: gpd.GeoDataFrame ) -> gpd.GeoDataFrame: """ - Merge geometries from PWD parcels into the primary feature layer. + Merge geometries from PWD parcels into the input GeoDataFrame. Identifies condominium units by checking for "CONDO" in building_code_description. Args: - primary_gdf (GeoDataFrame): The primary feature layer + primary_gdf (GeoDataFrame): The input GeoDataFrame pwd_parcels_gdf (GeoDataFrame): The PWD parcels GeoDataFrame Returns: @@ -52,9 +53,9 @@ def merge_pwd_parcels_gdf( primary_gdf.loc[condo_building_mask, "is_condo_unit"] = True # Join geometries from PWD parcels for non-condo units only - # Temporarily drop geometry from the primary feature layer + # Temporarily drop geometry from the input GeoDataFrame - # Filter PWD parcels to just the opa_ids in primary + # Filter PWD parcels to just the opa_ids in input opa_ids_in_primary = primary_gdf["opa_id"].unique() pwd_subset = pwd_parcels_gdf[pwd_parcels_gdf["opa_id"].isin(opa_ids_in_primary)] @@ -83,19 +84,20 @@ def merge_pwd_parcels_gdf( @validate_output(PWDParcelsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def pwd_parcels( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Updates the primary feature layer by replacing its geometry column with validated + Updates the input GeoDataFrame by replacing its geometry column with validated geometries from PWD 
parcels data. Retains point geometry for rows with no polygon geometry available. Identifies and flags condominium units. Args: - primary_featurelayer (FeatureLayer): The primary feature layer to update. + input_gdf (GeoDataFrame): The input GeoDataFrame to update. Returns: - FeatureLayer: The updated primary feature layer with geometries replaced + GeoDataFrame: The updated input GeoDataFrame with geometries replaced by those from PWD parcels or retained from the original layer if no match. Columns Added: @@ -109,7 +111,7 @@ def pwd_parcels( geometry: The geometry column is updated with validated geometries from PWD parcels. Condo units retain their original point geometries. - Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry Tagline: diff --git a/data/src/data_utils/rco_geoms.py b/data/src/data_utils/rco_geoms.py index 0327841c..66dfa0df 100644 --- a/data/src/data_utils/rco_geoms.py +++ b/data/src/data_utils/rco_geoms.py @@ -4,6 +4,7 @@ import geopandas as gpd import pandas as pd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.rco_geoms import RCOGeomsOutputValidator @@ -17,16 +18,17 @@ @validate_output(RCOGeomsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def rco_geoms(input_gdf: gpd.GeoDataFrame) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Adds Registered Community Organization (RCO) information to the primary feature layer + Adds Registered Community Organization (RCO) information to the input GeoDataFrame by performing a spatial join and aggregating RCO data. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. 
Returns: - FeatureLayer: The input feature layer with added RCO-related columns, + GeoDataFrame: The input GeoDataFrame with added RCO-related columns, including aggregated RCO information and names. Tagline: @@ -42,7 +44,7 @@ def rco_geoms(input_gdf: gpd.GeoDataFrame) -> Tuple[gpd.GeoDataFrame, Validation Notes: Modifies various columns. Fillna and infer_objects is applied to most columns. - Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry """ loader = EsriLoader(name="RCOs", esri_urls=RCOS_LAYERS_TO_LOAD) diff --git a/data/src/data_utils/tactical_urbanism.py b/data/src/data_utils/tactical_urbanism.py index 364eb5c6..2a236bd1 100644 --- a/data/src/data_utils/tactical_urbanism.py +++ b/data/src/data_utils/tactical_urbanism.py @@ -2,34 +2,36 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.tactical_urbanism import TacticalUrbanismOutputValidator @validate_output(TacticalUrbanismOutputValidator) +@provide_metadata(current_metadata=current_metadata) def tactical_urbanism( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Assigns a 'tactical_urbanism' value to each row in the primary feature layer based on specific conditions. + Assigns a 'tactical_urbanism' value to each row in the input GeoDataFrame based on specific conditions. Tactical urbanism is marked as True if the property is a parcel of type 'Land', and does not have any unsafe or immediately dangerous buildings. Otherwise, it is False. Args: - primary_featurelayer: A FeatureLayer object containing a GeoDataFrame (`gdf`) as an attribute. + input_gdf (GeoDataFrame): The input GeoDataFrame containing property data. Columns Added: tactical_urbanism (bool): Indicates whether each property qualifies for tactical urbanism (True or False). 
- Primary Feature Layer Columns Referenced: + Columns referenced: parcel_type, unsafe_building, imm_dang_building Tagline: Identify tactical urbanism-eligible properties Returns: - The input FeatureLayer with a new column 'tactical_urbanism' added to its GeoDataFrame. + The input GeoDataFrame with a new column 'tactical_urbanism' added to its GeoDataFrame. """ tactical_urbanism_values = [] diff --git a/data/src/data_utils/tree_canopy.py b/data/src/data_utils/tree_canopy.py index 40ca661a..adbaac07 100644 --- a/data/src/data_utils/tree_canopy.py +++ b/data/src/data_utils/tree_canopy.py @@ -7,6 +7,7 @@ import requests from src.classes.file_manager import FileManager, LoadType +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.tree_canopy import TreeCanopyOutputValidator @@ -17,18 +18,19 @@ @validate_output(TreeCanopyOutputValidator) +@provide_metadata(current_metadata=current_metadata) def tree_canopy( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Adds tree canopy gap information to the primary feature layer by downloading, + Adds tree canopy gap information to the input GeoDataFrame by downloading, processing, and spatially joining tree canopy data for Philadelphia County. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with an added "tree_canopy_gap" column + GeoDataFrame: The input GeoDataFrame with an added "tree_canopy_gap" column indicating the tree canopy gap for each property. Tagline: @@ -37,7 +39,7 @@ def tree_canopy( Columns added: tree_canopy_gap (float): The amount of tree canopy lacking. 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id, geometry Source: diff --git a/data/src/data_utils/unsafe_buildings.py b/data/src/data_utils/unsafe_buildings.py index b52a7a71..58f38f02 100644 --- a/data/src/data_utils/unsafe_buildings.py +++ b/data/src/data_utils/unsafe_buildings.py @@ -3,6 +3,7 @@ import geopandas as gpd +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.unsafe_buildings import UnsafeBuildingsOutputValidator @@ -14,18 +15,19 @@ @validate_output(UnsafeBuildingsOutputValidator) +@provide_metadata(current_metadata=current_metadata) def unsafe_buildings( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Adds unsafe building information to the primary feature layer by joining with a dataset + Adds unsafe building information to the input GeoDataFrame by joining with a dataset of unsafe buildings. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with an added "unsafe_building" column, + GeoDataFrame: The input GeoDataFrame with an added "unsafe_building" column, indicating whether each property is categorized as an unsafe building ("Y" or "N"). Tagline: @@ -34,7 +36,7 @@ def unsafe_buildings( Columns Added: unsafe_building (bool): Indicates whether each property is categorized as an unsafe building (True or False). 
- Primary Feature Layer Columns Referenced: + Columns referenced: opa_id Source: @@ -101,7 +103,7 @@ def unsafe_buildings( f"Deduplicated unsafe buildings: {before_dedup} -> {after_dedup} records (removed {before_dedup - after_dedup} duplicates)" ) - # Join unsafe buildings data with primary feature layer + # Join unsafe buildings data with input GeoDataFrame merged_gdf = opa_join(input_gdf, unsafe_buildings) # Fill missing values with False for non-unsafe buildings and convert to boolean diff --git a/data/src/data_utils/vacant_properties.py b/data/src/data_utils/vacant_properties.py index bb6e210e..4f6b00db 100644 --- a/data/src/data_utils/vacant_properties.py +++ b/data/src/data_utils/vacant_properties.py @@ -6,6 +6,7 @@ import pandas as pd from src.config.config import ROOT_DIRECTORY +from src.metadata.metadata_utils import current_metadata, provide_metadata from src.validation.base import ValidationResult, validate_output from src.validation.vacant_properties import VacantPropertiesOutputValidator @@ -96,18 +97,19 @@ def check_null_percentage(df: pd.DataFrame, threshold: float = 0.05) -> None: @validate_output(VacantPropertiesOutputValidator) +@provide_metadata(current_metadata=current_metadata) def vacant_properties( input_gdf: gpd.GeoDataFrame, ) -> Tuple[gpd.GeoDataFrame, ValidationResult]: """ - Adds a "vacant" column to the primary feature layer based on vacant property data from + Adds a "vacant" column to the input GeoDataFrame based on vacant property data from ESRI layers and backup data from local geoparquet files if necessary. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The GeoDataFrame containing property data. Returns: - FeatureLayer: The input feature layer with an added "vacant" column. + GeoDataFrame: The input GeoDataFrame with an added "vacant" column. Tagline: Identify vacant properties. 
@@ -115,7 +117,7 @@ def vacant_properties( Columns Added: vacant (bool): Indicates whether the property is vacant. - Primary Feature Layer Columns Referenced: + Columns referenced: opa_id Known Issues: @@ -208,7 +210,7 @@ def vacant_properties( # Final check for null percentages check_null_percentage(df) - # Add "vacant" column to primary feature layer + # Add "vacant" column to input GeoDataFrame input_gdf["vacant"] = input_gdf["opa_id"].isin(df["opa_id"]) # Drop parcel_type column after processing diff --git a/data/src/main.py b/data/src/main.py index f04a150f..5baefe0f 100644 --- a/data/src/main.py +++ b/data/src/main.py @@ -3,6 +3,7 @@ import sys import traceback +import geopandas as gpd import pandas as pd from src.classes.data_diff import DiffReport @@ -42,6 +43,7 @@ unsafe_buildings, vacant_properties, ) +from src.metadata.metadata_utils import current_metadata file_manager = FileManager() token = os.getenv("CAGP_SLACK_API_TOKEN") @@ -97,7 +99,7 @@ def main(): pipeline_errors = {} pipeline_logger.info("Loading OPA properties dataset.") - dataset, opa_validation = opa_properties() + dataset, opa_validation = opa_properties(gdf=gpd.GeoDataFrame()) pipeline_logger.info("OPA properties loaded.") # Check for missing zoning values after OPA properties @@ -166,15 +168,13 @@ def main(): # Save metadata try: - # Initialize collected_metadata if it doesn't exist (since services return GeoDataFrame, not FeatureLayer) - if not hasattr(dataset, "collected_metadata"): - dataset.collected_metadata = [] - - if dataset.collected_metadata: - # Create tmp directory if it doesn't exist - os.makedirs("tmp", exist_ok=True) - metadata_df = pd.DataFrame(dataset.collected_metadata) - metadata_df.to_csv("tmp/metadata.csv", index=False) + if current_metadata: + metadata_df = pd.DataFrame(current_metadata) + metadata_file_path = file_manager.get_file_path( + "metadata.csv", load_type=LoadType.TEMP + ) + print(metadata_file_path) + pd.DataFrame(metadata_df).to_csv(metadata_file_path, 
index=False) else: print("No collected_metadata found in dataset - skipping metadata save") except Exception as e: diff --git a/data/src/metadata/metadata_utils.py b/data/src/metadata/metadata_utils.py index b94e2591..4fac0bb8 100644 --- a/data/src/metadata/metadata_utils.py +++ b/data/src/metadata/metadata_utils.py @@ -4,6 +4,7 @@ import sys import time from datetime import datetime, timezone +from typing import Any, List import geopandas as gpd @@ -21,12 +22,12 @@ "columns updated", "source", "known issues", - "primary feature layer columns referenced", + "columns referenced", ] METADATA_FIELD_TYPES = { "columns added": "columns", # with types "columns updated": "columns", # without types - "primary feature layer columns referenced": "column_names", + "columns referenced": "column_names", } @@ -158,7 +159,9 @@ def parse_docstring(docstring): return result -def detect_added_columns(df_before, df_after): +def detect_added_columns( + df_before: gpd.GeoDataFrame, df_after: gpd.GeoDataFrame +) -> set[str]: """ Detects columns that have been added in df_after compared to df_before. Handles cases where df_before is None or empty. @@ -168,13 +171,14 @@ def detect_added_columns(df_before, df_after): return set(df_after.columns) - set(df_before.columns) -def provide_metadata(): +def provide_metadata(current_metadata: List[dict[str, Any]]): """ Decorator to collect metadata from ETL functions. - The collected metadata is stored in the`collected_metadata` attribute of the FeatureLayer object. + The collected metadata is stored in the `current_metadata` list that is passed into + each decorator attached to a data service. - Apply this decorator by adding `@provide_metadata()` above the function definition. + Apply this decorator by adding `@provide_metadata(current_metadata)` above the function definition. 
The metadata collects info from the docstring in the following format: @@ -197,7 +201,7 @@ def provide_metadata(): Columns updated: column_name: Description of how this column was changed. - Primary Feature Layer Columns Referenced: + Columns referenced: column_name (data_type): Description of how this column is referenced. Source: @@ -211,31 +215,21 @@ def provide_metadata(): def decorator(func): @functools.wraps(func) - def wrapper(primary_featurelayer=None): + def wrapper(gdf: gpd.GeoDataFrame): # Run the function and collect metadata # including start time, end time, and duration - if ( - primary_featurelayer is None - or not hasattr(primary_featurelayer, "gdf") - or primary_featurelayer.gdf is None - ): - gdf_before = gpd.GeoDataFrame() - current_metadata = [] - else: - gdf_before = primary_featurelayer.gdf.copy() - current_metadata = primary_featurelayer.collected_metadata + start_gdf = gdf if not gdf.empty else gpd.GeoDataFrame() + start_time_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") start_time = time.time() - if primary_featurelayer is None: - primary_featurelayer = func() - else: - primary_featurelayer = func(primary_featurelayer) + + end_gdf, validation = func(gdf) + end_time = time.time() end_time_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") try: - gdf_after = primary_featurelayer.gdf.copy() - detected_columns_added = detect_added_columns(gdf_before, gdf_after) + detected_columns_added = detect_added_columns(start_gdf, end_gdf) func_name = func.__name__ doc_meta = parse_docstring(func.__doc__) @@ -260,7 +254,8 @@ def wrapper(primary_featurelayer=None): f"Listed in docstring: {names_of_columns_added}" ) - primary_featurelayer.collected_metadata = current_metadata + [metadata] + current_metadata.append(metadata) + except Exception as e: print("Failed to collect metadata for", func.__name__) print(type(e), e) @@ -269,10 +264,14 @@ def wrapper(primary_featurelayer=None): "name": func.__name__, "description": "Failed to 
collect metadata", } - primary_featurelayer.collected_metadata = current_metadata + [metadata] - return primary_featurelayer + current_metadata.append(metadata) + + return end_gdf, validation return wrapper return decorator + + +current_metadata: List[dict] = [] diff --git a/data/src/test/data_utils/test_data_utils.py b/data/src/test/data_utils/test_data_utils.py index 9eae8c71..b786e591 100644 --- a/data/src/test/data_utils/test_data_utils.py +++ b/data/src/test/data_utils/test_data_utils.py @@ -23,7 +23,7 @@ class TestDataUtils(unittest.TestCase): """ - Test methods for data utils feature layer classes + Test methods for data utils GeoDataFrame classes """ @classmethod @@ -408,7 +408,7 @@ def test_vacant_properties(self): def test_pwd_parcels_merge(self): """ This tests that the merge_pwd_parcels_gdf function correctly retains - existing point geometries in the primary GeoDataFrame when no better + existing point geometries in the input GeoDataFrame when no better geometry is available in the PWD parcels GeoDataFrame. """ diff --git a/data/src/test/test_metadata_utils.py b/data/src/test/test_metadata_utils.py index 7e4bd3c1..a41556a0 100644 --- a/data/src/test/test_metadata_utils.py +++ b/data/src/test/test_metadata_utils.py @@ -9,7 +9,6 @@ get_sections_from_docstring, normalize_whitespace, parse_docstring, - provide_metadata, ) # Stub functions with actual docstrings used for parsing tests @@ -22,7 +21,7 @@ def stub_update_vacant_community(primary_featurelayer): as vacant, preventing potential predatory development. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The feature layer containing property data. Returns: FeatureLayer: The input feature layer with the 'vacant' column updated to False @@ -34,7 +33,7 @@ def stub_update_vacant_community(primary_featurelayer): Columns updated: vacant: Updated to False for parcels containing community gardens. 
- Primary Feature Layer Columns Referenced: + Columns referenced: vacant, ipa_id """ pass @@ -45,7 +44,7 @@ def stub_kde_analysis(primary_featurelayer): Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. Args: - primary_featurelayer (FeatureLayer): The feature layer containing property data. + input_gdf (GeoDataFrame): The feature layer containing property data. Returns: FeatureLayer: The input feature layer with KDE analysis results for drug crimes. @@ -95,7 +94,7 @@ def stub_update_vacant_ppr(primary_featurelayer): are marked as not vacant. This prevents PPR properties from being miscategorized as vacant. Args: - primary_featurelayer (FeatureLayer): The primary feature layer to update. + input_gdf (GeoDataFrame): The primary feature layer to update. Returns: FeatureLayer: The updated primary feature layer. @@ -122,7 +121,7 @@ def stub_columns_added_variation(primary_featurelayer): Function with a docstring that uses 'Columns Added:' key variation. Args: - primary_featurelayer (FeatureLayer): The feature layer. + input_gdf (GeoDataFrame): The feature layer. Returns: FeatureLayer: The updated feature layer. @@ -142,7 +141,7 @@ def stub_only_args_and_returns(primary_featurelayer): Function with only args and returns sections. Args: - primary_featurelayer (FeatureLayer): The feature layer. + input_gdf (GeoDataFrame): The feature layer. Returns: FeatureLayer: The updated feature layer. @@ -150,14 +149,13 @@ def stub_only_args_and_returns(primary_featurelayer): pass -@provide_metadata() @pytest.mark.skip def sample_add_columns(primary_featurelayer): """ Adds columns to the primary feature layer. Args: - primary_featurelayer (FeatureLayer): The feature layer to update. + input_gdf (GeoDataFrame): The feature layer to update. Returns: FeatureLayer: The updated primary feature layer. 
@@ -252,7 +250,7 @@ def test_parse_docstring(self): "columns added": [], "source": "", "known issues": "", - "primary feature layer columns referenced": ["vacant", "ipa_id"], + "columns referenced": ["vacant", "ipa_id"], }, ), ( @@ -265,7 +263,7 @@ def test_parse_docstring(self): "columns added": [], "columns updated": [], "known issues": "", - "primary feature layer columns referenced": [], + "columns referenced": [], }, ), ( @@ -279,7 +277,7 @@ def test_parse_docstring(self): "columns added": "list_of_12", "columns updated": [], "known issues": "", - "primary feature layer columns referenced": [], + "columns referenced": [], }, ), ( @@ -302,7 +300,7 @@ def test_parse_docstring(self): "known issues": ( "If the Ersi REST URL is not available the function" # NOTE: because the next line has a colon, only the first line is captured ), - "primary feature layer columns referenced": [], + "columns referenced": [], }, ), ( @@ -326,7 +324,7 @@ def test_parse_docstring(self): "columns updated": [], "source": "", "known issues": "", - "primary feature layer columns referenced": [], + "columns referenced": [], }, ), ( @@ -339,7 +337,7 @@ def test_parse_docstring(self): "columns updated": [], "source": "", "known issues": "", - "primary feature layer columns referenced": [], + "columns referenced": [], }, ), ] diff --git a/data/src/validation/base.py b/data/src/validation/base.py index 4969403f..057b9c6e 100644 --- a/data/src/validation/base.py +++ b/data/src/validation/base.py @@ -3,7 +3,7 @@ import time from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Callable, List, Optional +from typing import Callable, List, Optional, Tuple import geopandas as gpd import pandas as pd @@ -680,11 +680,13 @@ def _validate_application_specific(self, gdf: gpd.GeoDataFrame, errors: list): def validate_output( - validator_cls: BaseValidator, + validator_cls: type[BaseValidator], ): - def decorator(func: Callable[[gpd.GeoDataFrame], gpd.GeoDataFrame]): + 
def decorator( + func: Callable[[gpd.GeoDataFrame], Tuple[gpd.GeoDataFrame, ValidationResult]], + ): @functools.wraps(func) - def wrapper(gdf: gpd.GeoDataFrame = None, *args, **kwargs): + def wrapper(gdf: gpd.GeoDataFrame, *args, **kwargs): decorator_start = time.time() # Create validator diff --git a/data/test_service.py b/data/test_service.py index a6bebd63..af69ace3 100644 --- a/data/test_service.py +++ b/data/test_service.py @@ -7,6 +7,8 @@ import sys +import pytest + from src.classes.loaders import BaseLoader from src.config.config import enable_statistical_summaries @@ -153,6 +155,7 @@ def run_dependencies(dataset, dependencies): return dataset +@pytest.mark.skip def test_service(service_name: str): """Test a specific service with the base OPA properties dataset."""