diff --git a/data/src/main.py b/data/src/main.py index 66a30287..bc97fa43 100644 --- a/data/src/main.py +++ b/data/src/main.py @@ -1,14 +1,42 @@ import sys -import pandas as pd import traceback +import pandas as pd from config.psql import conn -from config.config import tiles_file_id_prefix - -from new_etl.classes.slack_reporters import send_dataframe_profile_to_slack, send_pg_stats_to_slack, send_error_to_slack from new_etl.classes.data_diff import DiffReport -from new_etl.data_utils import * -from new_etl.database import to_postgis_with_schema +from new_etl.classes.slack_reporters import ( + send_dataframe_profile_to_slack, + send_error_to_slack, + send_pg_stats_to_slack, +) +from new_etl.data_utils import ( + city_owned_properties, + community_gardens, + conservatorship, + contig_neighbors, + council_dists, + delinquencies, + dev_probability, + drug_crimes, + gun_crimes, + imm_dang_buildings, + li_complaints, + li_violations, + nbhoods, + negligent_devs, + owner_type, + park_priority, + phs_properties, + ppr_properties, + pwd_parcels, + rco_geoms, + tactical_urbanism, + tree_canopy, + unsafe_buildings, + vacant_properties, +) + +from config.config import tiles_file_id_prefix # Ensure the directory containing awkde is in the Python path awkde_path = "/usr/src/app" @@ -17,7 +45,6 @@ try: - print("Starting ETL process.") services = [ @@ -58,6 +85,12 @@ dataset = priority_level(dataset) dataset = access_process(dataset) + # Save metadata + try: + metadata_df = pd.DataFrame(dataset.collected_metadata) + metadata_df.to_csv("tmp/metadata.csv", index=False) + except Exception as e: + print(f"Error saving metadata: {str(e)}") # Drop duplicates before_drop = dataset.gdf.shape[0] dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id") @@ -72,8 +105,12 @@ "num_years_owed", "permit_count", ] - dataset.gdf[numeric_columns] = dataset.gdf[numeric_columns].apply(pd.to_numeric, errors="coerce") - dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str) + dataset.gdf[numeric_columns] = dataset.gdf[numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) + dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype( + str + ) # Dataset profiling send_dataframe_profile_to_slack(dataset.gdf, "all_properties_end") diff --git a/data/src/new_etl/classes/featurelayer.py b/data/src/new_etl/classes/featurelayer.py index 9e5fd08b..9d42f584 100644 --- a/data/src/new_etl/classes/featurelayer.py +++ b/data/src/new_etl/classes/featurelayer.py @@ -2,11 +2,20 @@ import os import subprocess import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed import geopandas as gpd import pandas as pd import requests import sqlalchemy as sa +from config.psql import conn, local_engine +from google.cloud import storage +from google.cloud.storage.bucket import Bucket +from new_etl.database import to_postgis_with_schema +from new_etl.loaders import load_carto_data, load_esri_data +from shapely import wkb +from tqdm import tqdm + from config.config import ( FORCE_RELOAD, USE_CRS, @@ -14,15 +23,6 @@ min_tiles_file_size_in_bytes, write_production_tiles_file, ) -from config.psql import conn, local_engine -from google.cloud import storage -from google.cloud.storage.bucket import Bucket -from shapely import wkb -from concurrent.futures import ThreadPoolExecutor, as_completed -from tqdm import tqdm - -from new_etl.loaders import load_esri_data, load_carto_data -from new_etl.database import to_postgis_with_schema log.basicConfig(level=log_level) @@ -33,8 +33,8 
@@ def google_cloud_bucket() -> Bucket: Returns: Bucket: the gcp bucket """ - credentials_path = os.path.expanduser("/app/service-account-key.json") + credentials_path = os.path.expanduser("/app/service-account-key.json") if not os.path.exists(credentials_path): raise FileNotFoundError(f"Credentials file not found at {credentials_path}") @@ -63,7 +63,12 @@ def __init__( cols: list[str] = None, max_workers=os.cpu_count(), chunk_size=100000, + collected_metadata=None, ): + if collected_metadata is None: + self.collected_metadata = [] + else: + self.collected_metadata = collected_metadata self.name = name self.esri_rest_urls = ( [esri_rest_urls] if isinstance(esri_rest_urls, str) else esri_rest_urls @@ -84,7 +89,6 @@ def __init__( inputs = [self.esri_rest_urls, self.carto_sql_queries, self.gdf] non_none_inputs = [i for i in inputs if i is not None] - if len(non_none_inputs) > 0: self.type = ( "esri" diff --git a/data/src/new_etl/data_utils/access_process.py b/data/src/new_etl/data_utils/access_process.py index ae3af8e6..235acbb9 100644 --- a/data/src/new_etl/data_utils/access_process.py +++ b/data/src/new_etl/data_utils/access_process.py @@ -1,6 +1,9 @@ from typing import Any +from new_etl.metadata.metadata_utils import provide_metadata + +@provide_metadata() def access_process(dataset: Any) -> Any: """ Process a dataset to determine the access process for each property based on @@ -13,6 +16,15 @@ def access_process(dataset: Any) -> Any: Returns: Any: The updated dataset with an additional "access_process" column. + Tagline: + Assigns access processes + + Columns added: + access_process (str): The access process for each property based on city ownership and market value. + + Primary Feature Layer Columns Referenced: + city_owner_agency, market_value + Side Effects: Prints the distribution of the "access_process" column. """ diff --git a/data/src/new_etl/data_utils/city_owned_properties.py b/data/src/new_etl/data_utils/city_owned_properties.py index 7e967712..3b714daf 100644 --- a/data/src/new_etl/data_utils/city_owned_properties.py +++ b/data/src/new_etl/data_utils/city_owned_properties.py @@ -1,7 +1,9 @@ from ..classes.featurelayer import FeatureLayer from ..constants.services import CITY_OWNED_PROPERTIES_TO_LOAD +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def city_owned_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Processes city-owned property data by joining it with the primary feature layer, @@ -15,6 +17,20 @@ def city_owned_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The updated primary feature layer with processed city ownership information. + + Columns added: + city_owner_agency (str): The agency that owns the city property. + side_yard_eligible (str): Indicates if the property is eligible for the side yard program. 
+ + Primary Feature Layer Columns Referenced: + opa_id, owner_1, owner2 + + Tagline: + Categorizes City Owned Properties + + Source: + https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/LAMAAssets/FeatureServer/0/ + """ city_owned_properties = FeatureLayer( name="City Owned Properties", diff --git a/data/src/new_etl/data_utils/community_gardens.py b/data/src/new_etl/data_utils/community_gardens.py index 80fa00df..ba72d9b3 100644 --- a/data/src/new_etl/data_utils/community_gardens.py +++ b/data/src/new_etl/data_utils/community_gardens.py @@ -1,13 +1,35 @@ +from config.config import USE_CRS + from ..classes.featurelayer import FeatureLayer from ..constants.services import COMMUNITY_GARDENS_TO_LOAD -from config.config import USE_CRS +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def community_gardens(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Updates the 'vacant' column in the primary feature layer to ensure community gardens are marked as not vacant. This protects known community gardens from being categorized as vacant, preventing potential predatory development. + + Args: + primary_featurelayer (FeatureLayer): The feature layer containing property data. + + Returns: + FeatureLayer: The input feature layer with the 'vacant' column updated to False + for parcels containing community gardens. + + Tagline: + Mark Community Gardens as Not Vacant + + Columns updated: + vacant: Updated to False for parcels containing community gardens. + + Primary Feature Layer Columns Referenced: + opa_id, vacant + + Source: + https://services2.arcgis.com/qjOOiLCYeUtwT7x7/arcgis/rest/services/PHS_NGT_Supported_Current_view/FeatureServer/0/ """ if "vacant" not in primary_featurelayer.gdf.columns: raise ValueError("The 'vacant' column is missing in the primary feature layer.") diff --git a/data/src/new_etl/data_utils/conservatorship.py b/data/src/new_etl/data_utils/conservatorship.py index 4e53c0fe..559664c0 100644 --- a/data/src/new_etl/data_utils/conservatorship.py +++ b/data/src/new_etl/data_utils/conservatorship.py @@ -1,7 +1,10 @@ -from ..classes.featurelayer import FeatureLayer import datetime -from dateutil.parser import parse + import pytz +from dateutil.parser import parse + +from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata est = pytz.timezone("US/Eastern") six_months_ago = (datetime.datetime.now() - datetime.timedelta(days=180)).astimezone( @@ -9,6 +12,7 @@ ) +@provide_metadata() def conservatorship(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Determines conservatorship eligibility for properties in a feature layer. @@ -16,6 +20,15 @@ def conservatorship(primary_featurelayer: FeatureLayer) -> FeatureLayer: Args: primary_featurelayer (FeatureLayer): A feature layer containing property data in a GeoDataFrame (`gdf`). + Columns Added: + conservatorship (str): Indicates whether each property qualifies for conservatorship ("Yes" or "No"). + + Primary Feature Layer Columns Referenced: + city_owner_agency, sheriff_sale, market_value, all_violations_past_year, sale_date + + Tagline: + Identify conservatorship-eligible properties + Returns: FeatureLayer: The input feature layer with an added "conservatorship" column indicating whether each property qualifies for conservatorship ("Yes" or "No"). 
diff --git a/data/src/new_etl/data_utils/contig_neighbors.py b/data/src/new_etl/data_utils/contig_neighbors.py index c05f9d4c..80eb385d 100644 --- a/data/src/new_etl/data_utils/contig_neighbors.py +++ b/data/src/new_etl/data_utils/contig_neighbors.py @@ -1,10 +1,14 @@ import warnings + import networkx as nx -from libpysal.weights import Queen import numpy as np +from libpysal.weights import Queen + from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def contig_neighbors(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Calculates the number of contiguous vacant neighbors for each property in a feature layer. @@ -15,6 +19,15 @@ def contig_neighbors(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with an added "n_contiguous" column indicating the number of contiguous vacant neighbors for each property. + + Tagline: + Count vacant neighbors + + Columns Added: + n_contiguous (int): The number of contiguous vacant neighbors for each property. + + Primary Feature Layer Columns Referenced: + opa_id, vacant """ # Create a filtered dataframe with only vacant properties and polygon geometries vacant_parcels = primary_featurelayer.gdf.loc[ diff --git a/data/src/new_etl/data_utils/council_dists.py b/data/src/new_etl/data_utils/council_dists.py index 74709709..100c7009 100644 --- a/data/src/new_etl/data_utils/council_dists.py +++ b/data/src/new_etl/data_utils/council_dists.py @@ -1,10 +1,13 @@ +import pandas as pd + from ..classes.featurelayer import FeatureLayer from ..constants.services import COUNCIL_DISTRICTS_TO_LOAD -import pandas as pd +from ..metadata.metadata_utils import provide_metadata pd.set_option("future.no_silent_downcasting", True) +@provide_metadata() def council_dists(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Associates properties in the primary feature layer with council districts @@ -16,6 +19,15 @@ def council_dists(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with properties spatially joined to council districts, ensuring no duplicate entries. + + Tagline: + Assigns council districts + + Columns added: + district (str): The council district associated with the property. + + Primary Feature Layer Columns Referenced: + opa_id, geometry """ # Load council districts council_dists = FeatureLayer( diff --git a/data/src/new_etl/data_utils/delinquencies.py b/data/src/new_etl/data_utils/delinquencies.py index 700372b2..0701289a 100644 --- a/data/src/new_etl/data_utils/delinquencies.py +++ b/data/src/new_etl/data_utils/delinquencies.py @@ -1,7 +1,9 @@ from ..classes.featurelayer import FeatureLayer from ..constants.services import DELINQUENCIES_QUERY +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def delinquencies(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Adds property tax delinquency information to the primary feature layer by @@ -13,6 +15,24 @@ def delinquencies(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with added columns for tax delinquency information, including total due, actionable status, payment agreements, and more. + + Tagline: + Summarize tax delinquencies + + Source: + https://phl.carto.com/api/v2/sql + + Columns Added: + total_due (float): Total amount owed. + most_recent_year_owed (str): Most recent year owed. + num_years_owed (int): Number of years owed. 
+ payment_agreement (str): Indicates if there is a payment agreement. + is_actionable (str): Flag for actionable tax delinquency. + sheriff_sale (str): Indicates if the property is at risk of sheriff sale. + total_assessment (float): Total property assessment. + + Primary Feature Layer Columns Referenced: + opa_id """ tax_delinquencies = FeatureLayer( name="Property Tax Delinquencies", diff --git a/data/src/new_etl/data_utils/dev_probability.py b/data/src/new_etl/data_utils/dev_probability.py index 325b2e04..f69c51b5 100644 --- a/data/src/new_etl/data_utils/dev_probability.py +++ b/data/src/new_etl/data_utils/dev_probability.py @@ -2,11 +2,15 @@ import jenkspy import pandas as pd import requests + +from config.config import USE_CRS + from ..classes.featurelayer import FeatureLayer from ..constants.services import CENSUS_BGS_URL, PERMITS_QUERY -from config.config import USE_CRS +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def dev_probability(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Calculates development probability based on permit counts and assigns @@ -19,6 +23,19 @@ def dev_probability(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with added spatial join data for development probability and ranks. + + Tagline: + Calculate development probability + + Columns Added: + permit_count (int): The number of permits issued in the census block group. + dev_rank (str): The development rank of the census block group. + + Primary Feature Layer Columns Referenced: + opa_id, geometry + + Source: + https://phl.carto.com/api/v2/sql """ census_bgs_gdf = gpd.read_file(CENSUS_BGS_URL) census_bgs_gdf = census_bgs_gdf.to_crs(USE_CRS) diff --git a/data/src/new_etl/data_utils/drug_crimes.py b/data/src/new_etl/data_utils/drug_crimes.py index 479ac08a..a14c51e8 100644 --- a/data/src/new_etl/data_utils/drug_crimes.py +++ b/data/src/new_etl/data_utils/drug_crimes.py @@ -1,8 +1,11 @@ +from new_etl.data_utils.kde import apply_kde_to_primary + from ..classes.featurelayer import FeatureLayer from ..constants.services import DRUGCRIME_SQL_QUERY -from new_etl.data_utils.kde import apply_kde_to_primary +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def drug_crimes(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. @@ -12,6 +15,22 @@ def drug_crimes(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with KDE analysis results for drug crimes. + + Tagline: + Density analysis for drug crimes + + Columns added: + drug_crimes_density (float): KDE density of drug crimes. + drug_crimes_density_zscore (float): Z-score of drug crime density. + drug_crimes_density_label (str): Categorized density level. + drug_crimes_density_percentile (float): Percentile rank of density. 
+ + Primary Feature Layer Columns Referenced: + geometry + + Source: + https://phl.carto.com/api/v2/sql + """ return apply_kde_to_primary( primary_featurelayer, "Drug Crimes", DRUGCRIME_SQL_QUERY diff --git a/data/src/new_etl/data_utils/gun_crimes.py b/data/src/new_etl/data_utils/gun_crimes.py index e9f2d1fd..1bfd8da5 100644 --- a/data/src/new_etl/data_utils/gun_crimes.py +++ b/data/src/new_etl/data_utils/gun_crimes.py @@ -1,8 +1,11 @@ +from new_etl.data_utils.kde import apply_kde_to_primary + from ..classes.featurelayer import FeatureLayer from ..constants.services import GUNCRIME_SQL_QUERY -from new_etl.data_utils.kde import apply_kde_to_primary +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def gun_crimes(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Applies kernel density estimation (KDE) analysis for gun crimes to the primary feature layer. @@ -12,5 +15,20 @@ def gun_crimes(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with KDE analysis results for gun crimes. + + Tagline: + Analyzes gun crime density + + Columns added: + gun_crimes_density (float): KDE density of gun crimes. + gun_crimes_density_zscore (float): Z-score of gun crime density. + gun_crimes_density_label (str): Categorized density level. + gun_crimes_density_percentile (float): Percentile rank of density. + + Primary Feature Layer Columns Referenced: + geometry + + Source: + https://phl.carto.com/api/v2/sql """ return apply_kde_to_primary(primary_featurelayer, "Gun Crimes", GUNCRIME_SQL_QUERY) diff --git a/data/src/new_etl/data_utils/imm_dang_buildings.py b/data/src/new_etl/data_utils/imm_dang_buildings.py index 5163be63..415046fa 100644 --- a/data/src/new_etl/data_utils/imm_dang_buildings.py +++ b/data/src/new_etl/data_utils/imm_dang_buildings.py @@ -1,7 +1,9 @@ from ..classes.featurelayer import FeatureLayer from ..constants.services import IMMINENT_DANGER_BUILDINGS_QUERY +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def imm_dang_buildings(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Adds information about imminently dangerous buildings to the primary feature layer @@ -13,6 +15,18 @@ def imm_dang_buildings(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with an added "imm_dang_building" column, indicating whether each property is categorized as imminently dangerous ("Y" or "N"). + + Tagline: + Identify imminently dangerous buildings + + Columns Added: + imm_dang_building (str): Indicates whether each property is categorized as imminently dangerous ("Y" or "N"). + + Primary Feature Layer Columns Referenced: + opa_id + + Source: + https://phl.carto.com/api/v2/sql """ imm_dang_buildings = FeatureLayer( name="Imminently Dangerous Buildings", diff --git a/data/src/new_etl/data_utils/li_complaints.py b/data/src/new_etl/data_utils/li_complaints.py index 3778cd23..c4fb3d3a 100644 --- a/data/src/new_etl/data_utils/li_complaints.py +++ b/data/src/new_etl/data_utils/li_complaints.py @@ -1,8 +1,10 @@ from ..classes.featurelayer import FeatureLayer from ..constants.services import COMPLAINTS_SQL_QUERY from ..data_utils.kde import apply_kde_to_primary +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def li_complaints(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Applies kernel density estimation (KDE) analysis for L&I complaints to the primary feature layer. 
@@ -13,6 +15,22 @@ def li_complaints(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with KDE analysis results for L&I complaints, including density and derived metrics. + + Tagline: + Analyzes L&I complaint density + + Columns added: + l_and_i_complaints_density (float): KDE density of complaints. + l_and_i_complaints_density_zscore (float): Z-score of complaint density. + l_and_i_complaints_density_label (str): Categorized density level. + l_and_i_complaints_density_percentile (float): Percentile rank of density. + + Primary Feature Layer Columns Referenced: + geometry + + Source: + https://phl.carto.com/api/v2/sql + """ return apply_kde_to_primary( primary_featurelayer, "L and I Complaints", COMPLAINTS_SQL_QUERY diff --git a/data/src/new_etl/data_utils/li_violations.py b/data/src/new_etl/data_utils/li_violations.py index efd52db5..d53fec85 100644 --- a/data/src/new_etl/data_utils/li_violations.py +++ b/data/src/new_etl/data_utils/li_violations.py @@ -1,10 +1,14 @@ -import pandas as pd -import geopandas as gpd from typing import List + +import geopandas as gpd +import pandas as pd + from ..classes.featurelayer import FeatureLayer from ..constants.services import VIOLATIONS_SQL_QUERY +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def li_violations(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Process L&I (Licenses and Inspections) data for violations. @@ -18,6 +22,19 @@ def li_violations(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The primary feature layer updated with L&I data. + + Tagline: + Counts L&I violations + + Columns added: + all_violations_past_year (int): Total violations in the past year. + open_violations_past_year (int): Open violations in the past year. + + Source: + https://phl.carto.com/api/v2/sql + + Primary Feature Layer Columns Referenced: + opa_id """ keywords: List[str] = [ "dumping", diff --git a/data/src/new_etl/data_utils/nbhoods.py b/data/src/new_etl/data_utils/nbhoods.py index 31c1f0a3..854df31c 100644 --- a/data/src/new_etl/data_utils/nbhoods.py +++ b/data/src/new_etl/data_utils/nbhoods.py @@ -1,9 +1,13 @@ import geopandas as gpd + +from config.config import USE_CRS + from ..classes.featurelayer import FeatureLayer from ..constants.services import NBHOODS_URL -from config.config import USE_CRS +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def nbhoods(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Adds neighborhood information to the primary feature layer by performing a spatial join @@ -15,6 +19,18 @@ def nbhoods(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with an added "neighborhood" column, containing the name of the neighborhood for each property. + + Tagline: + Assigns neighborhoods + + Columns added: + neighborhood (str): The name of the neighborhood associated with the property. 
+ + Primary Feature Layer Columns Referenced: + opa_id, geometry + + Source: + https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/philadelphia-neighborhoods/philadelphia-neighborhoods.geojson """ phl_nbhoods = gpd.read_file(NBHOODS_URL) diff --git a/data/src/new_etl/data_utils/negligent_devs.py b/data/src/new_etl/data_utils/negligent_devs.py index 194eb2cc..26609d46 100644 --- a/data/src/new_etl/data_utils/negligent_devs.py +++ b/data/src/new_etl/data_utils/negligent_devs.py @@ -1,6 +1,8 @@ from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def negligent_devs(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Identifies negligent developers based on the number of vacant properties owned @@ -9,6 +11,17 @@ def negligent_devs(primary_featurelayer: FeatureLayer) -> FeatureLayer: Args: primary_featurelayer (FeatureLayer): The feature layer containing property data. + Columns Added: + negligent_dev (bool): non-city owned entities owning 5+ vacant properties + n_total_properties_owned (int): Total number of properties owned by the developer + n_vacant_properties_owned (int): Number of vacant properties owned by the developer + + Primary Feature Layer Columns Referenced: + opa_id, vacant, city_owner_agency, standardized_address + + Tagline: + Identify negligent developers + Returns: FeatureLayer: The input feature layer with additional columns for total properties owned, vacant properties owned, and a "negligent_dev" flag. diff --git a/data/src/new_etl/data_utils/opa_properties.py b/data/src/new_etl/data_utils/opa_properties.py index 5b6ce34c..08894ac9 100644 --- a/data/src/new_etl/data_utils/opa_properties.py +++ b/data/src/new_etl/data_utils/opa_properties.py @@ -1,5 +1,8 @@ -import pandas as pd import re + +import pandas as pd +from new_etl.metadata.metadata_utils import provide_metadata + from ..classes.featurelayer import FeatureLayer from ..constants.services import OPA_PROPERTIES_QUERY @@ -75,12 +78,33 @@ def create_standardized_address(row: pd.Series) -> str: return standardized_address.lower() +@provide_metadata() def opa_properties() -> FeatureLayer: """ Loads and processes OPA property data, standardizing addresses and cleaning geometries. Returns: FeatureLayer: A feature layer containing processed OPA property data. 
+ + Columns Added: + opa_id (int): the OPA ID of the property + market_value (float): the market value from the OPA data + sale_date (str): the date of the last sale + sale_price (float): the price of the last sale + parcel_type (str): "Land" or "Building" + zip_code (str): The zip code of the property + zoning (str): The zoning of the property + owner_1 (str): The first owner of the property + owner_2 (str): The second owner of the property + building_code_description (str): The building code description + standardized_address (str): A standardized mailing address + geometry (geometry): The geometry of the property + + Source: + https://phl.carto.com/api/v2/sql + + Tagline: + Load OPA data """ opa = FeatureLayer( name="OPA Properties", @@ -104,7 +128,6 @@ def opa_properties() -> FeatureLayer: "zoning", ], ) - # Rename columns opa.gdf = opa.gdf.rename(columns={"parcel_number": "opa_id"}) diff --git a/data/src/new_etl/data_utils/owner_type.py b/data/src/new_etl/data_utils/owner_type.py index bcae8e00..b899ebc5 100644 --- a/data/src/new_etl/data_utils/owner_type.py +++ b/data/src/new_etl/data_utils/owner_type.py @@ -1,7 +1,10 @@ import pandas as pd + from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def owner_type(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Determines the ownership type for each property in the primary feature layer based on @@ -15,6 +18,15 @@ def owner_type(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The updated feature layer with the 'owner_type' column added. + + Tagline: + Assigns ownership types + + Columns added: + owner_type (str): The ownership type of the property: Public, Business (LLC) or Individual. + + Primary Feature Layer Columns Referenced: + opa_id, owner_1, owner_2, city_owner_agency """ owner_types = [] diff --git a/data/src/new_etl/data_utils/park_priority.py b/data/src/new_etl/data_utils/park_priority.py index a35b652e..4c47b314 100644 --- a/data/src/new_etl/data_utils/park_priority.py +++ b/data/src/new_etl/data_utils/park_priority.py @@ -4,12 +4,15 @@ from typing import List, Union import geopandas as gpd +import pyogrio import requests from bs4 import BeautifulSoup -from ..classes.featurelayer import FeatureLayer -from config.config import USE_CRS from tqdm import tqdm -import pyogrio + +from config.config import USE_CRS + +from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata def get_latest_shapefile_url() -> str: @@ -84,6 +87,7 @@ def download_and_process_shapefile( return phl_parks +@provide_metadata() def park_priority(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Downloads and processes park priority data, then joins it with the primary feature layer. @@ -93,6 +97,18 @@ def park_priority(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The primary feature layer with park priority data joined. + + Tagline: + Labels high-priority park areas. + + Columns Added: + park_priority (int): The park priority score. 
+ + Primary Feature Layer Columns Referenced: + opa_id, geometry + + Source: + https://www.tpl.org/park-data-downloads """ park_url: str = get_latest_shapefile_url() print(f"Downloading park priority data from: {park_url}") diff --git a/data/src/new_etl/data_utils/phs_properties.py b/data/src/new_etl/data_utils/phs_properties.py index fb0f20be..1410e457 100644 --- a/data/src/new_etl/data_utils/phs_properties.py +++ b/data/src/new_etl/data_utils/phs_properties.py @@ -1,7 +1,9 @@ from ..classes.featurelayer import FeatureLayer from ..constants.services import PHS_LAYERS_TO_LOAD +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Perform a spatial join between the primary feature layer and the PHS properties layer, @@ -13,6 +15,15 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The updated primary feature layer with the 'phs_care_program' column. + + Tagline: + Identifies PHS Care properties + + Columns added: + phs_care_program (str): The PHS care program associated with the property. + + Primary Feature Layer Columns Referenced: + opa_id, geometry """ phs_properties = FeatureLayer( diff --git a/data/src/new_etl/data_utils/ppr_properties.py b/data/src/new_etl/data_utils/ppr_properties.py index 84c16a8b..7394e2cb 100644 --- a/data/src/new_etl/data_utils/ppr_properties.py +++ b/data/src/new_etl/data_utils/ppr_properties.py @@ -1,11 +1,16 @@ import io + import geopandas as gpd import requests + +from config.config import USE_CRS + from ..classes.featurelayer import FeatureLayer from ..constants.services import PPR_PROPERTIES_TO_LOAD -from config.config import USE_CRS +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def ppr_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Updates the 'vacant' column in the primary feature layer to ensure PPR properties @@ -17,6 +22,23 @@ def ppr_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The updated primary feature layer. + + Columns Updated: + vacant: Updated to False for PPR properties. + + Tagline: + Mark Parks as Not Vacant + + Source: + https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0 + + Known Issues: + If the Ersi REST URL is not available the function + will fall back to loading the data from a GeoJSON URL + https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson + + Primary Feature Layer Columns Referenced: + opa_id, geometry, vacant, public_name """ fallback_url = "https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson" diff --git a/data/src/new_etl/data_utils/priority_level.py b/data/src/new_etl/data_utils/priority_level.py index 6c0525c6..90e29da3 100644 --- a/data/src/new_etl/data_utils/priority_level.py +++ b/data/src/new_etl/data_utils/priority_level.py @@ -1,7 +1,10 @@ import pandas as pd +from new_etl.metadata.metadata_utils import provide_metadata + from ..classes.featurelayer import FeatureLayer +@provide_metadata() def priority_level(dataset: FeatureLayer) -> FeatureLayer: """ Determines priority levels for properties based on gun crime density, @@ -13,6 +16,17 @@ def priority_level(dataset: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with an added "priority_level" column, indicating the priority for each property as "Low", "Medium", or "High". 
+ + Columns Added: + priority_level (str): The priority level ( "Low", "Medium", or "High") of the property + based on gun crime density, violations, tree canopy gaps, and PHS Landcare status. + + Tagline: + Add priority levels + + Source: + gun_crimes_density_zscore, all_violations_past_year, l_and_i_complaints_density_zscore, + tree_canopy_gap, phs_care_program columns in the primary feature layer. """ priority_levels = [] for idx, row in dataset.gdf.iterrows(): diff --git a/data/src/new_etl/data_utils/pwd_parcels.py b/data/src/new_etl/data_utils/pwd_parcels.py index 0bde9b59..1cc46c61 100644 --- a/data/src/new_etl/data_utils/pwd_parcels.py +++ b/data/src/new_etl/data_utils/pwd_parcels.py @@ -1,8 +1,11 @@ +import geopandas as gpd + from ..classes.featurelayer import FeatureLayer from ..constants.services import PWD_PARCELS_QUERY -import geopandas as gpd +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Updates the primary feature layer by replacing its geometry column with validated @@ -15,6 +18,18 @@ def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The updated primary feature layer with geometries replaced by those from PWD parcels or retained from the original layer if no match. + + Columns Updated: + geometry: The geometry column is updated with validated geometries from PWD parcels. + + Primary Feature Layer Columns Referenced: + opa_id, geometry + + Tagline: + Improve geometry with PWD parcels data. + + Source: + https://phl.carto.com/api/v2/sql """ # Load PWD parcels pwd_parcels = FeatureLayer( diff --git a/data/src/new_etl/data_utils/rco_geoms.py b/data/src/new_etl/data_utils/rco_geoms.py index 504a8d90..3b294656 100644 --- a/data/src/new_etl/data_utils/rco_geoms.py +++ b/data/src/new_etl/data_utils/rco_geoms.py @@ -1,10 +1,13 @@ +import pandas as pd + from ..classes.featurelayer import FeatureLayer from ..constants.services import RCOS_LAYERS_TO_LOAD -import pandas as pd +from ..metadata.metadata_utils import provide_metadata pd.set_option("future.no_silent_downcasting", True) +@provide_metadata() def rco_geoms(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Adds Registered Community Organization (RCO) information to the primary feature layer @@ -16,6 +19,22 @@ def rco_geoms(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with added RCO-related columns, including aggregated RCO information and names. + + Tagline: + Assigns Community Org Info + + Columns added: + rco_names (str): Names of RCOs associated with the property. + rco_info (str): Additional RCO-related information. + + Source: + "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Zoning_RCO/FeatureServer/0/" + + Notes: + Modifies various columns. Fillna and infer_objects is applied to most columns. 
+ + Primary Feature Layer Columns Referenced: + opa_id, geometry """ rco_geoms = FeatureLayer(name="RCOs", esri_rest_urls=RCOS_LAYERS_TO_LOAD) diff --git a/data/src/new_etl/data_utils/tactical_urbanism.py b/data/src/new_etl/data_utils/tactical_urbanism.py index 68ca85e2..e862a2c3 100644 --- a/data/src/new_etl/data_utils/tactical_urbanism.py +++ b/data/src/new_etl/data_utils/tactical_urbanism.py @@ -1,6 +1,8 @@ from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def tactical_urbanism(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Assigns a 'tactical_urbanism' value to each row in the primary feature layer based on specific conditions. @@ -11,6 +13,15 @@ def tactical_urbanism(primary_featurelayer: FeatureLayer) -> FeatureLayer: Args: primary_featurelayer: A FeatureLayer object containing a GeoDataFrame (`gdf`) as an attribute. + Columns Added: + tactical_urbanism (str): Indicates whether each property qualifies for tactical urbanism ("Yes" or "No"). + + Primary Feature Layer Columns Referenced: + parcel_type, unsafe_building, imm_dang_building + + Tagline: + Identify tactical urbanism-eligible properties + Returns: The input FeatureLayer with a new column 'tactical_urbanism' added to its GeoDataFrame. """ diff --git a/data/src/new_etl/data_utils/tree_canopy.py b/data/src/new_etl/data_utils/tree_canopy.py index 9e67b4e2..8b5632ec 100644 --- a/data/src/new_etl/data_utils/tree_canopy.py +++ b/data/src/new_etl/data_utils/tree_canopy.py @@ -1,11 +1,16 @@ -import requests import io import zipfile + import geopandas as gpd -from ..classes.featurelayer import FeatureLayer +import requests + from config.config import USE_CRS +from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata + +@provide_metadata() def tree_canopy(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Adds tree canopy gap information to the primary feature layer by downloading, @@ -17,6 +22,18 @@ def tree_canopy(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with an added "tree_canopy_gap" column indicating the tree canopy gap for each property. + + Tagline: + Measures tree canopy gaps. + + Columns added: + tree_canopy_gap (float): The amount of tree canopy lacking. + + Primary Feature Layer Columns Referenced: + opa_id, geometry + + Source: + https://national-tes-data-share.s3.amazonaws.com/national_tes_share/pa.zip.zip """ tree_url = ( "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/pa.zip.zip" diff --git a/data/src/new_etl/data_utils/unsafe_buildings.py b/data/src/new_etl/data_utils/unsafe_buildings.py index 655621a3..1e096529 100644 --- a/data/src/new_etl/data_utils/unsafe_buildings.py +++ b/data/src/new_etl/data_utils/unsafe_buildings.py @@ -1,7 +1,9 @@ from ..classes.featurelayer import FeatureLayer from ..constants.services import UNSAFE_BUILDINGS_QUERY +from ..metadata.metadata_utils import provide_metadata +@provide_metadata() def unsafe_buildings(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Adds unsafe building information to the primary feature layer by joining with a dataset @@ -13,6 +15,18 @@ def unsafe_buildings(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with an added "unsafe_building" column, indicating whether each property is categorized as an unsafe building ("Y" or "N"). 
+ + Tagline: + Identify unsafe buildings + + Columns Added: + unsafe_building (str): Indicates whether each property is categorized as an unsafe building ("Y" or "N"). + + Primary Feature Layer Columns Referenced: + opa_id + + Source: + https://phl.carto.com/api/v2/sql """ unsafe_buildings = FeatureLayer( name="Unsafe Buildings", diff --git a/data/src/new_etl/data_utils/vacant_properties.py b/data/src/new_etl/data_utils/vacant_properties.py index 84845ee4..9b4b56f0 100644 --- a/data/src/new_etl/data_utils/vacant_properties.py +++ b/data/src/new_etl/data_utils/vacant_properties.py @@ -1,9 +1,12 @@ -from ..classes.featurelayer import FeatureLayer, google_cloud_bucket -from ..constants.services import VACANT_PROPS_LAYERS_TO_LOAD -import geopandas as gpd from io import BytesIO + +import geopandas as gpd import pandas as pd +from ..classes.featurelayer import FeatureLayer, google_cloud_bucket +from ..constants.services import VACANT_PROPS_LAYERS_TO_LOAD +from ..metadata.metadata_utils import provide_metadata + def load_backup_data_from_gcs(file_name: str) -> pd.DataFrame: """ @@ -54,6 +57,7 @@ def check_null_percentage(df: pd.DataFrame, threshold: float = 0.05) -> None: ) +@provide_metadata() def vacant_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ Adds a "vacant" column to the primary feature layer based on vacant property data from @@ -64,6 +68,18 @@ def vacant_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer: Returns: FeatureLayer: The input feature layer with an added "vacant" column. + + Tagline: + Identify vacant properties. + + Columns Added: + vacant (bool): Indicates whether the property is vacant. + + Primary Feature Layer Columns Referenced: + opa_id + + Known Issues: + - The vacant land data is below the threshold, so backup data is loaded from GCS. """ vacant_properties = FeatureLayer( name="Vacant Properties", diff --git a/data/src/new_etl/metadata/metadata_utils.py b/data/src/new_etl/metadata/metadata_utils.py new file mode 100644 index 00000000..fbdec1d8 --- /dev/null +++ b/data/src/new_etl/metadata/metadata_utils.py @@ -0,0 +1,278 @@ +import functools +import logging as log +import re +import sys +import time +from datetime import datetime, timezone + +import geopandas as gpd + +from config.config import log_level + +log.basicConfig(level=log_level) + +DESCRIPTION_REGEX = r"^(?P<description>.*?)(?=\n\s*\w+:\s|$)" +SECTION_REGEX = r"^\s*(?P<key>[\w\s]+):\s*(?P<value>.*?)(?=^\s*[\w\s]+:\s|\Z)" +METADATA_FIELDS = [ + "description", + "returns", + "tagline", + "columns added", + "columns updated", + "source", + "known issues", + "primary feature layer columns referenced", +] +METADATA_FIELD_TYPES = { + "columns added": "columns", # with types + "columns updated": "columns", # without types + "primary feature layer columns referenced": "column_names", +} + + +def normalize_whitespace(text): + """Convert newlines to spaces and collapse multiple spaces into one.""" + text = text.replace("\n", " ") + text = re.sub(r"\s+", " ", text) + return text.strip() + + +def get_description_from_docstring(docstring): + """ + Extract the description from the docstring. + + Extracts all text before the first section header (e.g. Args:, Returns:).
+ + """ + + # Regex to capture the "description": everything until a section header + description_pattern = re.compile(DESCRIPTION_REGEX, re.DOTALL) + description_match = description_pattern.search(docstring) + description = ( + description_match.group("description").strip() if description_match else "" + ) + return description + + +def get_sections_from_docstring(docstring): + section_pattern = re.compile( + SECTION_REGEX, + re.DOTALL | re.MULTILINE, + ) + sections = { + m.group("key").lower(): m.group("value").strip() + for m in section_pattern.finditer(docstring) + } + return sections + + +def get_column_details(text): + """ + Parse the column details from the text in the format: + "column_name (data_type): description" + """ + pattern = r"(\w+)(?:\s+\((\w+)\))?:\s+(.+)" + + matches = re.findall(pattern, text) + + # Convert to structured data with default type as "unknown" + parsed_columns = [] + for name, dtype, desc in matches: + column = { + "name": name.strip(), + "description": desc.strip(), + } + if dtype: # Only add 'type' if dtype is not empty + column["type"] = dtype.strip() + + parsed_columns.append(column) + + return parsed_columns + + +def clean_docstring(docstring): + """ + trim function from PEP-257 + + Ensures that the docstring is clean, uniformly indented, and free of extraneous whitespace. + """ + if not docstring: + return "" + + # Convert tabs to spaces (following the normal Python rules) + # and split into a list of lines: + lines = docstring.expandtabs().splitlines() + # Determine minimum indentation (first line doesn't count): + indent = sys.maxsize + for line in lines[1:]: + stripped = line.lstrip() + if stripped: + indent = min(indent, len(line) - len(stripped)) + # Remove indentation (first line is special): + trimmed = [lines[0].strip()] + if indent < sys.maxsize: + for line in lines[1:]: + trimmed.append(line[indent:].rstrip()) + # Strip off trailing and leading blank lines: + while trimmed and not trimmed[-1]: + trimmed.pop() + while trimmed and not trimmed[0]: + trimmed.pop(0) + + # Current code/unittests expects a line return at + # end of multiline docstrings + # workaround expected behavior from unittests + if "\n" in docstring: + trimmed.append("") + + # Return a single string: + return "\n".join(trimmed) + + +def parse_docstring(docstring): + """Parse the docstring into its components.""" + # Capture the "description" which is everything before the first header + if not docstring: + return {} + docstring = clean_docstring(docstring.lstrip("\n")) + description = get_description_from_docstring(docstring) + sections = get_sections_from_docstring(docstring) + sections["description"] = description + + # "columns added" and "columns updated" require special handling + # to breakdown the columns listed in the docstring + + result = {} + for field in METADATA_FIELDS: + if METADATA_FIELD_TYPES.get(field, "text") == "columns": + result[field] = get_column_details(sections.get(field, "")) + elif METADATA_FIELD_TYPES.get(field, "text") == "column_names": + result[field] = [ + col.strip() for col in sections.get(field, "").split(",") if col.strip() + ] + else: + result[field] = ( + normalize_whitespace(sections[field]) if sections.get(field) else "" + ) + + return result + + +def detect_added_columns(df_before, df_after): + """ + Detects columns that have been added in df_after compared to df_before. + Handles cases where df_before is None or empty. 
+ """ + if df_before is None or df_before.empty: + return set(df_after.columns) + return set(df_after.columns) - set(df_before.columns) + + +def provide_metadata(): + """ + Decorator to collect metadata from ETL functions. + + The collected metadata is stored in the`collected_metadata` attribute of the FeatureLayer object. + + Apply this decorator by adding `@provide_metadata()` above the function definition. + + The metadata collects info from the docstring in the following format: + + ''' + Description of what the function does. + + Args: + param1 (Type): Description of parameter 1. + param2 (Type): Description of parameter 2. + + Returns: + ReturnType: Description of the return value. + + Tagline: + A very short summary of the function for use in DAG graphs. + + Columns added: + column_name (data_type): Description of what this new column represents. + + Columns updated: + column_name: Description of how this column was changed. + + Primary Feature Layer Columns Referenced: + column_name (data_type): Description of how this column is referenced. + + Source: + URL or reference for additional context. + + Known issues: + Any known issues or limitations with this function. + + ''' + """ + + def decorator(func): + @functools.wraps(func) + def wrapper(primary_featurelayer=None): + # Run the function and collect metadata + # including start time, end time, and duration + + if ( + primary_featurelayer is None + or not hasattr(primary_featurelayer, "gdf") + or primary_featurelayer.gdf is None + ): + gdf_before = gpd.GeoDataFrame() + current_metadata = [] + else: + gdf_before = primary_featurelayer.gdf.copy() + current_metadata = primary_featurelayer.collected_metadata + start_time_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + start_time = time.time() + if primary_featurelayer is None: + primary_featurelayer = func() + else: + primary_featurelayer = func(primary_featurelayer) + end_time = time.time() + end_time_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + try: + gdf_after = primary_featurelayer.gdf.copy() + detected_columns_added = detect_added_columns(gdf_before, gdf_after) + + func_name = func.__name__ + doc_meta = parse_docstring(func.__doc__) + + metadata = { + "name": func_name, + "start_time": start_time_str, + "end_time": end_time_str, + "duration_in_seconds": round(end_time - start_time, 2), + } + + for field in METADATA_FIELDS: + metadata[field.replace(" ", "_")] = doc_meta.get(field, "") + + names_of_columns_added = set( + [col["name"] for col in metadata.get("columns_added", [])] + ) + if detected_columns_added != names_of_columns_added: + log.debug( + "Columns added doesn't match columns listed as added in the docstring:" + f"Detected: {detected_columns_added}" + f"Listed in docstring: {names_of_columns_added}" + ) + + primary_featurelayer.collected_metadata = current_metadata + [metadata] + except Exception as e: + print("Failed to collect metadata for", func.__name__) + print(type(e), e) + log.error(e, exc_info=True) + metadata = { + "name": func.__name__, + "description": "Failed to collect metadata", + } + primary_featurelayer.collected_metadata = current_metadata + [metadata] + + return primary_featurelayer + + return wrapper + + return decorator diff --git a/data/src/test/test_metadata_utils.py b/data/src/test/test_metadata_utils.py new file mode 100644 index 00000000..f8f186eb --- /dev/null +++ b/data/src/test/test_metadata_utils.py @@ -0,0 +1,404 @@ +import unittest + +import geopandas as gpd +from new_etl.classes.featurelayer import 
FeatureLayer + +from ..new_etl.metadata.metadata_utils import ( + get_column_details, + get_description_from_docstring, + get_sections_from_docstring, + normalize_whitespace, + parse_docstring, + provide_metadata, +) + +# Stub functions with actual docstrings used for parsing tests + + +def stub_update_vacant_community(primary_featurelayer): + """ + Updates the 'vacant' column in the primary feature layer to ensure community gardens + are marked as not vacant. This protects known community gardens from being categorized + as vacant, preventing potential predatory development. + + Args: + primary_featurelayer (FeatureLayer): The feature layer containing property data. + + Returns: + FeatureLayer: The input feature layer with the 'vacant' column updated to False + for parcels containing community gardens. + + Tagline: + Mark Community Gardens as Not Vacant + + Columns updated: + vacant: Updated to False for parcels containing community gardens. + + Primary Feature Layer Columns Referenced: + vacant, ipa_id + """ + pass + + +def stub_kde_analysis(primary_featurelayer): + """ + Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. + + Args: + primary_featurelayer (FeatureLayer): The feature layer containing property data. + + Returns: + FeatureLayer: The input feature layer with KDE analysis results for drug crimes. + + Tagline: + Density analysis for drug crimes + + Source: + https://phl.carto.com/api/v2/sql + """ + pass + + +def stub_load_opa(): + """ + Loads and processes OPA property data, standardizing addresses and cleaning geometries. + + Returns: + FeatureLayer: A feature layer containing processed OPA property data. + + Columns added: + opa_id (type): desc + market_value (type): desc + sale_date (type): desc + sale_price (numeric): desc + parcel_type (type): desc + zip_code (type): desc + zoning (type): desc + owner_1 (type): desc + owner_2 (type): desc + building_code_description (str): desc + standardized_address (str): A standardized mailing address + geometry (type): desc + + Source: + https://phl.carto.com/api/v2/sql + + Tagline: + Load OPA data + """ + pass + + +def stub_update_vacant_ppr(primary_featurelayer): + """ + Updates the 'vacant' column in the primary feature layer to ensure PPR properties + are marked as not vacant. This prevents PPR properties from being miscategorized as vacant. + + Args: + primary_featurelayer (FeatureLayer): The primary feature layer to update. + + Returns: + FeatureLayer: The updated primary feature layer. + + Columns updated: + vacant: Updated to False for PPR properties. + + Tagline: + Mark Parks as Not Vacant + + Source: + https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0 + + Known issues: + If the Ersi REST URL is not available the function + will fall back to loading the data from a GeoJSON URL: + https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson + """ + pass + + +def stub_columns_added_variation(primary_featurelayer): + """ + Function with a docstring that uses 'Columns Added:' key variation. + + Args: + primary_featurelayer (FeatureLayer): The feature layer. + + Returns: + FeatureLayer: The updated feature layer. + + Columns Added: + column_x (int): Some description for column_x. + column_y (float): Some description for column_y. + + Tagline: + Example tagline + """ + pass + + +def stub_only_args_and_returns(primary_featurelayer): + """ + Function with only args and returns sections. 
+ + Args: + primary_featurelayer (FeatureLayer): The feature layer. + + Returns: + FeatureLayer: The updated feature layer. + """ + pass + + +@provide_metadata() +def sample_add_columns(primary_featurelayer): + """ + Adds columns to the primary feature layer. + + Args: + primary_featurelayer (FeatureLayer): The feature layer to update. + + Returns: + FeatureLayer: The updated primary feature layer. + + Columns added: + column1 (int): Description for column1. + + Tagline: + Example tagline + """ + new_layer = FeatureLayer( + name="stub_add_columns", + ) + + new_layer.gdf = gpd.GeoDataFrame( + data={"opa_number": ["1", "2", "3"], "column1": [1, 2, 3]} + ) + primary_featurelayer.opa_join( + new_layer.gdf, + "opa_number", + ) + + return primary_featurelayer + + +class TestMetadataUtils(unittest.TestCase): + def test_normalize_whitespace(self): + test_cases = [ + ("Hello world", "Hello world"), + ("Line1\nLine2", "Line1 Line2"), + (" Leading and multiple spaces ", "Leading and multiple spaces"), + ] + for input_text, expected in test_cases: + with self.subTest(input_text=input_text): + result = normalize_whitespace(input_text) + self.assertEqual(result, expected) + + def test_get_description_from_docstring(self): + # Ensure description extraction stops before a section header. + docstring = ( + "This is the function description.\n\n" + "Args:\n param (int): parameter description" + ) + expected = "This is the function description." + result = get_description_from_docstring(docstring) + self.assertEqual(result, expected) + + def test_get_sections_from_docstring(self): + # Test that sections are correctly extracted. + docstring = ( + "This is a description.\n\n" + "Args:\n param (int): description\n\n" + "Returns:\n int: result" + ) + sections = get_sections_from_docstring(docstring) + self.assertIn("args", sections) + self.assertIn("returns", sections) + self.assertTrue(sections["args"].strip()) + self.assertTrue(sections["returns"].strip()) + + def test_get_column_details(self): + # Test column details extraction. + text = "col1 (int): description for col1\ncol2 (str): description for col2" + expected = [ + {"name": "col1", "type": "int", "description": "description for col1"}, + {"name": "col2", "type": "str", "description": "description for col2"}, + ] + result = get_column_details(text) + self.assertEqual(result, expected) + + def test_parse_docstring(self): + test_cases = [ + ( + stub_update_vacant_community.__doc__, + { + "description": ( + "Updates the 'vacant' column in the primary feature layer to ensure community gardens " + "are marked as not vacant. This protects known community gardens from being categorized as " + "vacant, preventing potential predatory development." + ), + "returns": ( + "FeatureLayer: The input feature layer with the 'vacant' column updated to False for parcels " + "containing community gardens." 
+ ), + "tagline": "Mark Community Gardens as Not Vacant", + "columns updated": [ + { + "name": "vacant", + "description": "Updated to False for parcels containing community gardens.", + } + ], + "columns added": [], + "source": "", + "known issues": "", + "primary feature layer columns referenced": ["vacant", "ipa_id"], + }, + ), + ( + stub_kde_analysis.__doc__, + { + "description": "Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer.", + "returns": "FeatureLayer: The input feature layer with KDE analysis results for drug crimes.", + "tagline": "Density analysis for drug crimes", + "source": "https://phl.carto.com/api/v2/sql", + "columns added": [], + "columns updated": [], + "known issues": "", + "primary feature layer columns referenced": [], + }, + ), + ( + stub_load_opa.__doc__, + { + "description": "Loads and processes OPA property data, standardizing addresses and cleaning geometries.", + "returns": "FeatureLayer: A feature layer containing processed OPA property data.", + "tagline": "Load OPA data", + "source": "https://phl.carto.com/api/v2/sql", + # For columns added, we expect a list of 12 dictionaries. + "columns added": "list_of_12", + "columns updated": [], + "known issues": "", + "primary feature layer columns referenced": [], + }, + ), + ( + stub_update_vacant_ppr.__doc__, + { + "description": ( + "Updates the 'vacant' column in the primary feature layer to ensure PPR properties " + "are marked as not vacant. This prevents PPR properties from being miscategorized as vacant." + ), + "returns": "FeatureLayer: The updated primary feature layer.", + "tagline": "Mark Parks as Not Vacant", + "source": "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0", + "columns updated": [ + { + "name": "vacant", + "description": "Updated to False for PPR properties.", + } + ], + "columns added": [], + "known issues": ( + "If the Ersi REST URL is not available the function" # NOTE: because the next line has a colon, only the first line is captured + ), + "primary feature layer columns referenced": [], + }, + ), + ( + stub_columns_added_variation.__doc__, + { + "description": "Function with a docstring that uses 'Columns Added:' key variation.", + "returns": "FeatureLayer: The updated feature layer.", + "tagline": "Example tagline", + "columns added": [ + { + "name": "column_x", + "type": "int", + "description": "Some description for column_x.", + }, + { + "name": "column_y", + "type": "float", + "description": "Some description for column_y.", + }, + ], + "columns updated": [], + "source": "", + "known issues": "", + "primary feature layer columns referenced": [], + }, + ), + ( + stub_only_args_and_returns.__doc__, + { + "description": "Function with only args and returns sections.", + "returns": "FeatureLayer: The updated feature layer.", + "tagline": "", + "columns added": [], + "columns updated": [], + "source": "", + "known issues": "", + "primary feature layer columns referenced": [], + }, + ), + ] + for docstring, expected_metadata in test_cases: + with self.subTest(docstring=docstring): + print(expected_metadata["description"]) + metadata = parse_docstring(docstring) + # For stub_load_opa, we expect a list of 12 columns. 
+ if expected_metadata["columns added"] == "list_of_12": + self.assertIsInstance(metadata.get("columns added"), list) + self.assertEqual(len(metadata.get("columns added")), 12) + else: + self.assertEqual( + metadata.get("columns added"), + expected_metadata["columns added"], + ) + fields = metadata.keys() + for field in fields: + if field == "columns added": + continue + self.assertEqual(metadata.get(field), expected_metadata[field]) + + def test_provide_metadata_with_sample_add_columns(self): + # Test that the metadata is correctly added to the function output. + primary_featurelayer = FeatureLayer(name="stub") + primary_featurelayer.gdf = gpd.GeoDataFrame( + data={ + "opa_id": ["1", "2", "3"], + "existing_field": [1, 2, 3], + "geometry": gpd.points_from_xy([1, 2, 3], [1, 2, 3]), + } + ) + result = sample_add_columns(primary_featurelayer) + metadata = result.collected_metadata + + expected_metadata = { + "name": "sample_add_columns", + "description": "Adds columns to the primary feature layer.", + "returns": "FeatureLayer: The updated primary feature layer.", + "start_time": "2021-10-01 00:00:00", + "end_time": "2021-10-01 00:00:00", + "duration_in_seconds": 0.0, + "tagline": "Example tagline", + "columns_added": [ + { + "name": "column1", + "type": "int", + "description": "Description for column1.", + } + ], + "columns_updated": [], + "source": "", + "known_issues": "", + "primary_feature_layer_columns_referenced": [], + } + most_recent_metadata = metadata[-1] + fields = most_recent_metadata.keys() + self.assertEqual(sorted(fields), sorted(expected_metadata.keys())) + for field in fields: + if field in ["start_time", "end_time", "duration_in_seconds"]: + continue + self.assertEqual(most_recent_metadata.get(field), expected_metadata[field]) diff --git a/docs/ETL pipeline/DAG.md b/docs/ETL pipeline/DAG.md new file mode 100644 index 00000000..dbd64978 --- /dev/null +++ b/docs/ETL pipeline/DAG.md @@ -0,0 +1,60 @@ +## DAG for the new ETL Pipeline + +In practice, functions are run sequentially, but this DAG shows dependencies on prior data modifications/additions. + +```mermaid +%%{init: {'flowchart': {'nodeSpacing': 100, 'rankSpacing': 50}}}%% +graph LR + %% Initial Ingestion + OP[opa_properties
Load OPA data] + VP[vacant_properties
Identify vacant properties] + + %% First updates from ingestion + OP --> PP[pwd_parcels
Improve geometry with PWD parcels data] + OP --> LV[li_violations
Counts L&I violations] + OP --> CO[city_owned_properties
Categorizes City Owned Properties] + OP --> DL[delinquencies
Summarize tax delinquencies] + OP --> UB[unsafe_buildings
Identify unsafe buildings] + OP --> IDB[imm_dang_buildings
Identify imminently dangerous buildings] + + VP --> CG[community_gardens
Mark Community Gardens as Not Vacant] + VP --> PPR[ppr_properties
Mark Parks as Not Vacant] + + %% Branches from pwd_parcels (updated geometry) + PP --> CD[council_dists
Assigns council districts] + PP --> NB[nbhoods
Assigns neighborhoods] + PP --> RC[rco_geoms
Assigns Community Org Info] + PP --> PH[phs_properties
Identifies PHS Care properties] + PP --> LC[li_complaints
Analyzes L&I complaint density] + PP --> TC[tree_canopy
Measures tree canopy gaps] + PP --> GC[gun_crimes
Analyzes gun crime density] + PP --> DC[drug_crimes
Density analysis for drug crimes] + PP --> DP[dev_probability
Calculate development probability] + PP --> PPri[park_priority
Labels high-priority park areas] + + %% Updates from city ownership branch + CO --> OT[owner_type
Assigns ownership types] + CO --> CV[conservatorship
Identify conservatorship-eligible properties] + CO --> AP[access_process
Assigns access processes] + + %% Additional dependencies feeding into conservatorship + LV --> CV + DL --> CV + + %% Combining multiple updates for tactical urbanism + UB --> TU[tactical_urbanism
Identify tactical urbanism-eligible properties] + IDB --> TU + + %% Vacant branch updates feeding subsequent functions + CG --> CN[contig_neighbors
Count vacant neighbors] + PPR --> CN + CG --> ND[negligent_devs
Identify negligent developers] + PPR --> ND + + %% Priority level depends on several geometry-based outputs + GC --> PL[priority_level
Add priority levels] + LV --> PL + LC --> PL + TC --> PL + PH --> PL +``` diff --git a/docs/ETL pipeline/DataDict.md b/docs/ETL pipeline/DataDict.md new file mode 100644 index 00000000..b17cdf0b --- /dev/null +++ b/docs/ETL pipeline/DataDict.md @@ -0,0 +1,98 @@ +# New ETL Pipeline + +## Overview + +This document describes the datasets used in the pipeline, including their sources, attributes, update frequency, and known issues. + +## Data Dictionary + +| Column Name | Added by Function | Data Type | Description | Function Known Issues | Source | Function Description | Updated by Functions | +| :------------------------------------ | :-------------------- | :-------- | :----------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------------- | +| access_process | access_process | str | The access process for each property based on city ownership and market value. | | | Process a dataset to determine the access process for each property based on city ownership and market value. The result is added as a new column in the dataset. | | +| all_violations_past_year | li_violations | int | Total violations in the past year. | | https://phl.carto.com/api/v2/sql | Process L&I (Licenses and Inspections) data for violations. This function filters and processes L&I violations data, joining it with the primary feature layer based on spatial relationships and OPA (Office of Property Assessment) identifiers. | | +| building_code_description | opa_properties | str | The building code description | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| city_owner_agency | city_owned_properties | str | The agency that owns the city property. | | https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/LAMAAssets/FeatureServer/0/ | Processes city-owned property data by joining it with the primary feature layer, renaming columns, and updating access information for properties based on ownership. All instances where the "city_owner_agency" is "PLB" are changed to "Land Bank (PHDC)". | | +| conservatorship | conservatorship | str | Indicates whether each property qualifies for conservatorship ("Yes" or "No"). | | | Determines conservatorship eligibility for properties in a feature layer. | | +| dev_rank | dev_probability | str | The development rank of the census block group. | | https://phl.carto.com/api/v2/sql | Calculates development probability based on permit counts and assigns development ranks to census block groups. The results are joined to the primary feature layer. | | +| district | council_dists | str | The council district associated with the property. | | | Associates properties in the primary feature layer with council districts using a spatial join. 
| | +| drug_crimes_density | drug_crimes | float | KDE density of drug crimes. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. | | +| drug_crimes_density_label | drug_crimes | str | Categorized density level. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. | | +| drug_crimes_density_percentile | drug_crimes | float | Percentile rank of density. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. | | +| drug_crimes_density_zscore | drug_crimes | float | Z-score of drug crime density. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. | | +| geometry | opa_properties | geometry | The geometry of the property | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | pwd_parcels | +| gun_crimes_density | gun_crimes | float | KDE density of gun crimes. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for gun crimes to the primary feature layer. | | +| gun_crimes_density_label | gun_crimes | str | Categorized density level. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for gun crimes to the primary feature layer. | | +| gun_crimes_density_percentile | gun_crimes | float | Percentile rank of density. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for gun crimes to the primary feature layer. | | +| gun_crimes_density_zscore | gun_crimes | float | Z-score of gun crime density. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for gun crimes to the primary feature layer. | | +| imm_dang_building | imm_dang_buildings | str | Indicates whether each property is categorized as imminently dangerous ("Y" or "N"). | | https://phl.carto.com/api/v2/sql | Adds information about imminently dangerous buildings to the primary feature layer by joining with a dataset of dangerous buildings. | | +| is_actionable | delinquencies | str | Flag for actionable tax delinquency. | | https://phl.carto.com/api/v2/sql | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | | +| l_and_i_complaints_density | li_complaints | float | KDE density of complaints. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for L&I complaints to the primary feature layer. | | +| l_and_i_complaints_density_label | li_complaints | str | Categorized density level. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for L&I complaints to the primary feature layer. | | +| l_and_i_complaints_density_percentile | li_complaints | float | Percentile rank of density. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for L&I complaints to the primary feature layer. | | +| l_and_i_complaints_density_zscore | li_complaints | float | Z-score of complaint density. | | https://phl.carto.com/api/v2/sql | Applies kernel density estimation (KDE) analysis for L&I complaints to the primary feature layer. 
| | +| market_value | opa_properties | float | the market value from the OPA data | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| most_recent_year_owed | delinquencies | str | Most recent year owed. | | https://phl.carto.com/api/v2/sql | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | | +| n_contiguous | contig_neighbors | int | The number of contiguous vacant neighbors for each property. | | | Calculates the number of contiguous vacant neighbors for each property in a feature layer. | | +| n_total_properties_owned | negligent_devs | int | Total number of properties owned by the developer | | | Identifies negligent developers based on the number of vacant properties owned and flags them in the primary feature layer. | | +| n_vacant_properties_owned | negligent_devs | int | Number of vacant properties owned by the developer | | | Identifies negligent developers based on the number of vacant properties owned and flags them in the primary feature layer. | | +| negligent_dev | negligent_devs | bool | non-city owned entities owning 5+ vacant properties | | | Identifies negligent developers based on the number of vacant properties owned and flags them in the primary feature layer. | | +| neighborhood | nbhoods | str | The name of the neighborhood associated with the property. | | https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/philadelphia-neighborhoods/philadelphia-neighborhoods.geojson | Adds neighborhood information to the primary feature layer by performing a spatial join with a neighborhoods dataset. | | +| num_years_owed | delinquencies | int | Number of years owed. | | https://phl.carto.com/api/v2/sql | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | | +| opa_id | opa_properties | int | the OPA ID of the property | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| open_violations_past_year | li_violations | int | Open violations in the past year. | | https://phl.carto.com/api/v2/sql | Process L&I (Licenses and Inspections) data for violations. This function filters and processes L&I violations data, joining it with the primary feature layer based on spatial relationships and OPA (Office of Property Assessment) identifiers. | | +| owner_1 | opa_properties | str | The first owner of the property | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| owner_2 | opa_properties | str | The second owner of the property | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| owner_type | owner_type | str | The ownership type of the property: Public, Business (LLC) or Individual. | | | Determines the ownership type for each property in the primary feature layer based on the 'owner_1', 'owner_2', and 'city_owner_agency' columns. The ownership type is set as: - "Public" if 'city_owner_agency' is not NA. - "Business (LLC)" if 'city_owner_agency' is NA and "LLC" is found in 'owner_1' or 'owner_2'. - "Individual" if 'city_owner_agency' is NA and "LLC" is not found in 'owner_1' or 'owner_2'. 
| | +| parcel_type | opa_properties | str | "Land" or "Building" | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| park_priority | park_priority | int | The park priority score. | | https://www.tpl.org/park-data-downloads | Downloads and processes park priority data, then joins it with the primary feature layer. | | +| payment_agreement | delinquencies | str | Indicates if there is a payment agreement. | | https://phl.carto.com/api/v2/sql | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | | +| permit_count | dev_probability | int | The number of permits issued in the census block group. | | https://phl.carto.com/api/v2/sql | Calculates development probability based on permit counts and assigns development ranks to census block groups. The results are joined to the primary feature layer. | | +| phs_care_program | phs_properties | str | The PHS care program associated with the property. | | | Perform a spatial join between the primary feature layer and the PHS properties layer, then update the primary feature layer with a new column 'phs_care_program' indicating if the property is part of the PHS care program. | | +| priority_level | priority_level | str | The priority level ( "Low", "Medium", or "High") of the property | | gun_crimes_density_zscore, all_violations_past_year, l_and_i_complaints_density_zscore, tree_canopy_gap, phs_care_program columns in the primary feature layer. | Determines priority levels for properties based on gun crime density, violations, tree canopy gaps, and PHS Landcare status. | | +| rco_info | rco_geoms | str | Additional RCO-related information. | | "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Zoning_RCO/FeatureServer/0/" | Adds Registered Community Organization (RCO) information to the primary feature layer by performing a spatial join and aggregating RCO data. | | +| rco_names | rco_geoms | str | Names of RCOs associated with the property. | | "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Zoning_RCO/FeatureServer/0/" | Adds Registered Community Organization (RCO) information to the primary feature layer by performing a spatial join and aggregating RCO data. | | +| sale_date | opa_properties | str | the date of the last sale | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| sale_price | opa_properties | float | the price of the last sale | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| sheriff_sale | delinquencies | str | Indicates if the property is at risk of sheriff sale. | | https://phl.carto.com/api/v2/sql | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | | +| side_yard_eligible | city_owned_properties | str | Indicates if the property is eligible for the side yard program. | | https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/LAMAAssets/FeatureServer/0/ | Processes city-owned property data by joining it with the primary feature layer, renaming columns, and updating access information for properties based on ownership. All instances where the "city_owner_agency" is "PLB" are changed to "Land Bank (PHDC)". 
| | +| standardized_address | opa_properties | str | A standardized mailing address | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| tactical_urbanism | tactical_urbanism | str | Indicates whether each property qualifies for tactical urbanism ("Yes" or "No"). | | | Assigns a 'tactical_urbanism' value to each row in the primary feature layer based on specific conditions. Tactical urbanism is marked as "Yes" if the property is a parcel of type 'Land', and does not have any unsafe or immediately dangerous buildings. Otherwise, it is "No". | | +| total_assessment | delinquencies | float | Total property assessment. | | https://phl.carto.com/api/v2/sql | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | | +| total_due | delinquencies | float | Total amount owed. | | https://phl.carto.com/api/v2/sql | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | | +| tree_canopy_gap | tree_canopy | float | The amount of tree canopy lacking. | | https://national-tes-data-share.s3.amazonaws.com/national_tes_share/pa.zip.zip | Adds tree canopy gap information to the primary feature layer by downloading, processing, and spatially joining tree canopy data for Philadelphia County. | | +| unsafe_building | unsafe_buildings | str | Indicates whether each property is categorized as an unsafe building ("Y" or "N"). | | https://phl.carto.com/api/v2/sql | Adds unsafe building information to the primary feature layer by joining with a dataset of unsafe buildings. | | +| vacant | vacant_properties | bool | Indicates whether the property is vacant. | - The vacant land data is below the threshold, so backup data is loaded from GCS. | | Adds a "vacant" column to the primary feature layer based on vacant property data from ESRI layers and backup data from Google Cloud Storage if necessary. | community_gardens -> ppr_properties | +| zip_code | opa_properties | str | The zip code of the property | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | | +| zoning | opa_properties | str | The zoning of the property | | https://phl.carto.com/api/v2/sql | Loads and processes OPA property data, standardizing addresses and cleaning geometries. 
| | + +## By Function + +| name | start_time | end_time | duration | description | returns | tagline | columns_added | columns_updated | source | known_issues | primary_feature_layer_columns_referenced | +| :-------------------- | :------------------ | :------------------ | :------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------ | :-------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------- | +| opa_properties | 2025-03-20 01:45:03 | 2025-03-20 01:45:38 | 34.72 seconds | Loads and processes OPA property data, standardizing addresses and cleaning geometries. | FeatureLayer: A feature layer containing processed OPA property data. 
| Load OPA data | [{'name': 'opa_id', 'description': 'the OPA ID of the property', 'type': 'int'}, {'name': 'market_value', 'description': 'the market value from the OPA data', 'type': 'float'}, {'name': 'sale_date', 'description': 'the date of the last sale', 'type': 'str'}, {'name': 'sale_price', 'description': 'the price of the last sale', 'type': 'float'}, {'name': 'parcel_type', 'description': '"Land" or "Building"', 'type': 'str'}, {'name': 'zip_code', 'description': 'The zip code of the property', 'type': 'str'}, {'name': 'zoning', 'description': 'The zoning of the property', 'type': 'str'}, {'name': 'owner_1', 'description': 'The first owner of the property', 'type': 'str'}, {'name': 'owner_2', 'description': 'The second owner of the property', 'type': 'str'}, {'name': 'building_code_description', 'description': 'The building code description', 'type': 'str'}, {'name': 'standardized_address', 'description': 'A standardized mailing address', 'type': 'str'}, {'name': 'geometry', 'description': 'The geometry of the property', 'type': 'geometry'}] | [] | https://phl.carto.com/api/v2/sql | | [] | +| vacant_properties | 2025-03-20 01:45:38 | 2025-03-20 01:46:06 | 27.54 seconds | Adds a "vacant" column to the primary feature layer based on vacant property data from ESRI layers and backup data from Google Cloud Storage if necessary. | FeatureLayer: The input feature layer with an added "vacant" column. | Identify vacant properties. | [{'name': 'vacant', 'description': 'Indicates whether the property is vacant.', 'type': 'bool'}] | [] | | - The vacant land data is below the threshold, so backup data is loaded from GCS. | ['opa_id'] | +| pwd_parcels | 2025-03-20 01:46:06 | 2025-03-20 01:46:35 | 28.37 seconds | Updates the primary feature layer by replacing its geometry column with validated geometries from PWD parcels data. Retains point geometry for rows with no polygon geometry available. | FeatureLayer: The updated primary feature layer with geometries replaced by those from PWD parcels or retained from the original layer if no match. | Improve geometry with PWD parcels data. | [] | [{'name': 'geometry', 'description': 'The geometry column is updated with validated geometries from PWD parcels.'}] | https://phl.carto.com/api/v2/sql | | ['opa_id', 'geometry'] | +| council_dists | 2025-03-20 01:46:35 | 2025-03-20 01:46:57 | 21.87 seconds | Associates properties in the primary feature layer with council districts using a spatial join. | FeatureLayer: The input feature layer with properties spatially joined to council districts, ensuring no duplicate entries. | Assigns council districts | [{'name': 'district', 'description': 'The council district associated with the property.', 'type': 'str'}] | [] | | | ['opa_id', 'geometry'] | +| nbhoods | 2025-03-20 01:46:58 | 2025-03-20 01:47:06 | 8.58 seconds | Adds neighborhood information to the primary feature layer by performing a spatial join with a neighborhoods dataset. | FeatureLayer: The input feature layer with an added "neighborhood" column, containing the name of the neighborhood for each property. 
| Assigns neighborhoods | [{'name': 'neighborhood', 'description': 'The name of the neighborhood associated with the property.', 'type': 'str'}] | [] | https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/philadelphia-neighborhoods/philadelphia-neighborhoods.geojson | | ['opa_id', 'geometry'] | +| rco_geoms | 2025-03-20 01:47:07 | 2025-03-20 01:48:06 | 59.03 seconds | Adds Registered Community Organization (RCO) information to the primary feature layer by performing a spatial join and aggregating RCO data. | FeatureLayer: The input feature layer with added RCO-related columns, including aggregated RCO information and names. | Assigns Community Org Info | [{'name': 'rco_names', 'description': 'Names of RCOs associated with the property.', 'type': 'str'}, {'name': 'rco_info', 'description': 'Additional RCO-related information.', 'type': 'str'}] | [] | "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Zoning_RCO/FeatureServer/0/" | | ['opa_id', 'geometry'] | +| city_owned_properties | 2025-03-20 01:48:06 | 2025-03-20 01:48:25 | 18.64 seconds | Processes city-owned property data by joining it with the primary feature layer, renaming columns, and updating access information for properties based on ownership. All instances where the "city_owner_agency" is "PLB" are changed to "Land Bank (PHDC)". | FeatureLayer: The updated primary feature layer with processed city ownership information. | Categorizes City Owned Properties | [{'name': 'city_owner_agency', 'description': 'The agency that owns the city property.', 'type': 'str'}, {'name': 'side_yard_eligible', 'description': 'Indicates if the property is eligible for the side yard program.', 'type': 'str'}] | [] | https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/LAMAAssets/FeatureServer/0/ | | ['opa_id', 'owner_1', 'owner2'] | +| phs_properties | 2025-03-20 01:48:25 | 2025-03-20 01:48:56 | 30.57 seconds | Perform a spatial join between the primary feature layer and the PHS properties layer, then update the primary feature layer with a new column 'phs_care_program' indicating if the property is part of the PHS care program. | FeatureLayer: The updated primary feature layer with the 'phs_care_program' column. | Identifies PHS Care properties | [{'name': 'phs_care_program', 'description': 'The PHS care program associated with the property.', 'type': 'str'}] | [] | | | ['opa_id', 'geometry'] | +| community_gardens | 2025-03-20 01:48:57 | 2025-03-20 01:48:59 | 2.13 seconds | Updates the 'vacant' column in the primary feature layer to ensure community gardens are marked as not vacant. This protects known community gardens from being categorized as vacant, preventing potential predatory development. | FeatureLayer: The input feature layer with the 'vacant' column updated to False for parcels containing community gardens. | Mark Community Gardens as Not Vacant | [] | [{'name': 'vacant', 'description': 'Updated to False for parcels containing community gardens.'}] | https://services2.arcgis.com/qjOOiLCYeUtwT7x7/arcgis/rest/services/PHS_NGT_Supported_Current_view/FeatureServer/0/ | | ['opa_id', 'vacant'] | +| ppr_properties | 2025-03-20 01:49:00 | 2025-03-20 01:49:11 | 11.08 seconds | Updates the 'vacant' column in the primary feature layer to ensure PPR properties are marked as not vacant. This prevents PPR properties from being miscategorized as vacant. | FeatureLayer: The updated primary feature layer. 
| Mark Parks as Not Vacant | [] | [{'name': 'vacant', 'description': 'Updated to False for PPR properties.'}] | https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0 | If the Ersi REST URL is not available the function will fall back to loading the data from a GeoJSON URL https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson | ['opa_id', 'geometry', 'vacant', 'public_name'] | +| owner_type | 2025-03-20 01:49:12 | 2025-03-20 01:49:29 | 17.75 seconds | Determines the ownership type for each property in the primary feature layer based on the 'owner_1', 'owner_2', and 'city_owner_agency' columns. The ownership type is set as: - "Public" if 'city_owner_agency' is not NA. - "Business (LLC)" if 'city_owner_agency' is NA and "LLC" is found in 'owner_1' or 'owner_2'. - "Individual" if 'city_owner_agency' is NA and "LLC" is not found in 'owner_1' or 'owner_2'. | FeatureLayer: The updated feature layer with the 'owner_type' column added. | Assigns ownership types | [{'name': 'owner_type', 'description': 'The ownership type of the property: Public, Business (LLC) or Individual.', 'type': 'str'}] | [] | | | ['opa_id', 'owner_1', 'owner_2', 'city_owner_agency'] | +| li_violations | 2025-03-20 01:49:31 | 2025-03-20 01:49:38 | 7.11 seconds | Process L&I (Licenses and Inspections) data for violations. This function filters and processes L&I violations data, joining it with the primary feature layer based on spatial relationships and OPA (Office of Property Assessment) identifiers. | FeatureLayer: The primary feature layer updated with L&I data. | Counts L&I violations | [{'name': 'all_violations_past_year', 'description': 'Total violations in the past year.', 'type': 'int'}, {'name': 'open_violations_past_year', 'description': 'Open violations in the past year.', 'type': 'int'}] | [] | https://phl.carto.com/api/v2/sql | | ['opa_id'] | +| li_complaints | 2025-03-20 01:49:38 | 2025-03-20 02:00:23 | 645.06 seconds | Applies kernel density estimation (KDE) analysis for L&I complaints to the primary feature layer. | FeatureLayer: The input feature layer with KDE analysis results for L&I complaints, including density and derived metrics. | Analyzes L&I complaint density | [{'name': 'l_and_i_complaints_density', 'description': 'KDE density of complaints.', 'type': 'float'}, {'name': 'l_and_i_complaints_density_zscore', 'description': 'Z-score of complaint density.', 'type': 'float'}, {'name': 'l_and_i_complaints_density_label', 'description': 'Categorized density level.', 'type': 'str'}, {'name': 'l_and_i_complaints_density_percentile', 'description': 'Percentile rank of density.', 'type': 'float'}] | [] | https://phl.carto.com/api/v2/sql | | ['geometry'] | +| tree_canopy | 2025-03-20 02:00:25 | 2025-03-20 02:00:37 | 12.59 seconds | Adds tree canopy gap information to the primary feature layer by downloading, processing, and spatially joining tree canopy data for Philadelphia County. | FeatureLayer: The input feature layer with an added "tree_canopy_gap" column indicating the tree canopy gap for each property. | Measures tree canopy gaps. | [{'name': 'tree_canopy_gap', 'description': 'The amount of tree canopy lacking.', 'type': 'float'}] | [] | https://national-tes-data-share.s3.amazonaws.com/national_tes_share/pa.zip.zip | | ['opa_id', 'geometry'] | +| gun_crimes | 2025-03-20 02:00:38 | 2025-03-20 02:01:12 | 34.15 seconds | Applies kernel density estimation (KDE) analysis for gun crimes to the primary feature layer. 
| FeatureLayer: The input feature layer with KDE analysis results for gun crimes. | Analyzes gun crime density | [{'name': 'gun_crimes_density', 'description': 'KDE density of gun crimes.', 'type': 'float'}, {'name': 'gun_crimes_density_zscore', 'description': 'Z-score of gun crime density.', 'type': 'float'}, {'name': 'gun_crimes_density_label', 'description': 'Categorized density level.', 'type': 'str'}, {'name': 'gun_crimes_density_percentile', 'description': 'Percentile rank of density.', 'type': 'float'}] | [] | https://phl.carto.com/api/v2/sql | | ['geometry'] | +| drug_crimes | 2025-03-20 02:01:13 | 2025-03-20 02:01:47 | 33.43 seconds | Applies kernel density estimation (KDE) analysis for drug crimes to the primary feature layer. | FeatureLayer: The input feature layer with KDE analysis results for drug crimes. | Density analysis for drug crimes | [{'name': 'drug_crimes_density', 'description': 'KDE density of drug crimes.', 'type': 'float'}, {'name': 'drug_crimes_density_zscore', 'description': 'Z-score of drug crime density.', 'type': 'float'}, {'name': 'drug_crimes_density_label', 'description': 'Categorized density level.', 'type': 'str'}, {'name': 'drug_crimes_density_percentile', 'description': 'Percentile rank of density.', 'type': 'float'}] | [] | https://phl.carto.com/api/v2/sql | | ['geometry'] | +| delinquencies | 2025-03-20 02:01:49 | 2025-03-20 02:01:58 | 9.18 seconds | Adds property tax delinquency information to the primary feature layer by joining with a tax delinquencies dataset. | FeatureLayer: The input feature layer with added columns for tax delinquency information, including total due, actionable status, payment agreements, and more. | Summarize tax delinquencies | [{'name': 'total_due', 'description': 'Total amount owed.', 'type': 'float'}, {'name': 'most_recent_year_owed', 'description': 'Most recent year owed.', 'type': 'str'}, {'name': 'num_years_owed', 'description': 'Number of years owed.', 'type': 'int'}, {'name': 'payment_agreement', 'description': 'Indicates if there is a payment agreement.', 'type': 'str'}, {'name': 'is_actionable', 'description': 'Flag for actionable tax delinquency.', 'type': 'str'}, {'name': 'sheriff_sale', 'description': 'Indicates if the property is at risk of sheriff sale.', 'type': 'str'}, {'name': 'total_assessment', 'description': 'Total property assessment.', 'type': 'float'}] | [] | https://phl.carto.com/api/v2/sql | | ['opa_id'] | +| unsafe_buildings | 2025-03-20 02:02:00 | 2025-03-20 02:02:06 | 5.77 seconds | Adds unsafe building information to the primary feature layer by joining with a dataset of unsafe buildings. | FeatureLayer: The input feature layer with an added "unsafe_building" column, indicating whether each property is categorized as an unsafe building ("Y" or "N"). | Identify unsafe buildings | [{'name': 'unsafe_building', 'description': 'Indicates whether each property is categorized as an unsafe building ("Y" or "N").', 'type': 'str'}] | [] | https://phl.carto.com/api/v2/sql | | ['oda_id'] | +| imm_dang_buildings | 2025-03-20 02:02:06 | 2025-03-20 02:02:11 | 4.83 seconds | Adds information about imminently dangerous buildings to the primary feature layer by joining with a dataset of dangerous buildings. | FeatureLayer: The input feature layer with an added "imm_dang_building" column, indicating whether each property is categorized as imminently dangerous ("Y" or "N"). 
| Identify imminently dangerous buildings | [{'name': 'imm_dang_building', 'description': 'Indicates whether each property is categorized as imminently dangerous ("Y" or "N").', 'type': 'str'}] | [] | https://phl.carto.com/api/v2/sql | | ['opa_id'] | +| contig_neighbors | 2025-03-20 02:02:12 | 2025-03-20 02:02:14 | 1.66 seconds | Calculates the number of contiguous vacant neighbors for each property in a feature layer. | FeatureLayer: The input feature layer with an added "n_contiguous" column indicating the number of contiguous vacant neighbors for each property. | Count vacant neighbors | [{'name': 'n_contiguous', 'description': 'The number of contiguous vacant neighbors for each property.', 'type': 'int'}] | [] | | | ['opa_id', 'vacant'] | +| dev_probability | 2025-03-20 02:02:15 | 2025-03-20 02:02:31 | 15.98 seconds | Calculates development probability based on permit counts and assigns development ranks to census block groups. The results are joined to the primary feature layer. | FeatureLayer: The input feature layer with added spatial join data for development probability and ranks. | Calculate development probability | [{'name': 'permit_count', 'description': 'The number of permits issued in the census block group.', 'type': 'int'}, {'name': 'dev_rank', 'description': 'The development rank of the census block group.', 'type': 'str'}] | [] | https://phl.carto.com/api/v2/sql | | ['opa_id', 'geometry'] | +| negligent_devs | 2025-03-20 02:02:32 | 2025-03-20 02:02:34 | 2.05 seconds | Identifies negligent developers based on the number of vacant properties owned and flags them in the primary feature layer. | FeatureLayer: The input feature layer with additional columns for total properties owned, vacant properties owned, and a "negligent_dev" flag. | Identify negligent developers | [{'name': 'negligent_dev', 'description': 'non-city owned entities owning 5+ vacant properties', 'type': 'bool'}, {'name': 'n_total_properties_owned', 'description': 'Total number of properties owned by the developer', 'type': 'int'}, {'name': 'n_vacant_properties_owned', 'description': 'Number of vacant properties owned by the developer', 'type': 'int'}] | [] | | | ['opa_id', 'vacant', 'city_owner_agency', 'standardized_address'] | +| tactical_urbanism | 2025-03-20 02:02:34 | 2025-03-20 02:02:50 | 15.94 seconds | Assigns a 'tactical_urbanism' value to each row in the primary feature layer based on specific conditions. Tactical urbanism is marked as "Yes" if the property is a parcel of type 'Land', and does not have any unsafe or immediately dangerous buildings. Otherwise, it is "No". | The input FeatureLayer with a new column 'tactical_urbanism' added to its GeoDataFrame. | Identify tactical urbanism-eligible properties | [{'name': 'tactical_urbanism', 'description': 'Indicates whether each property qualifies for tactical urbanism ("Yes" or "No").', 'type': 'str'}] | [] | | | ['parcel_type', 'unsafe_building', 'imm_dang_building'] | +| conservatorship | 2025-03-20 02:02:53 | 2025-03-20 02:03:47 | 53.97 seconds | Determines conservatorship eligibility for properties in a feature layer. | FeatureLayer: The input feature layer with an added "conservatorship" column indicating whether each property qualifies for conservatorship ("Yes" or "No"). 
| Identify conservatorship-eligible properties | [{'name': 'conservatorship', 'description': 'Indicates whether each property qualifies for conservatorship ("Yes" or "No").', 'type': 'str'}] | [] | | | ['city_owner_agency', 'sheriff_sale', 'market_value', 'all_violations_past_year', 'sale_date'] | +| park_priority | 2025-03-20 02:03:49 | 2025-03-20 02:04:04 | 14.62 seconds | Downloads and processes park priority data, then joins it with the primary feature layer. | FeatureLayer: The primary feature layer with park priority data joined. | Labels high-priority park areas. | [{'name': 'park_priority', 'description': 'The park priority score.', 'type': 'int'}] | [] | https://www.tpl.org/park-data-downloads | | ['opa_id', 'geometry'] | +| priority_level | 2025-03-20 02:04:04 | 2025-03-20 02:04:25 | 20.22 seconds | Determines priority levels for properties based on gun crime density, violations, tree canopy gaps, and PHS Landcare status. | FeatureLayer: The input feature layer with an added "priority_level" column, indicating the priority for each property as "Low", "Medium", or "High". | Add priority levels | [{'name': 'priority_level', 'description': 'The priority level ( "Low", "Medium", or "High") of the property', 'type': 'str'}] | [] | gun_crimes_density_zscore, all_violations_past_year, l_and_i_complaints_density_zscore, tree_canopy_gap, phs_care_program columns in the primary feature layer. | | [] | +| access_process | 2025-03-20 02:04:27 | 2025-03-20 02:04:45 | 17.99 seconds | Process a dataset to determine the access process for each property based on city ownership and market value. The result is added as a new column in the dataset. | Any: The updated dataset with an additional "access_process" column. | Assigns access processes | [{'name': 'access_process', 'description': 'The access process for each property based on city ownership and market value.', 'type': 'str'}] | [] | | | ['city_owner_agency', 'market_value'] |
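The tables above are generated from the pipeline functions' docstrings, so the docstring layout is effectively the schema for this data dictionary. As a minimal sketch of that convention, the hypothetical function below uses the section names exercised by the new tests (Args, Returns, Tagline, Columns Added, Source, Known issues); the function name, column, and URL are invented for illustration, and the exact formatting the parser expects (for example, how referenced columns are listed) should be checked against `parse_docstring` itself.

```python
# Hypothetical pipeline step illustrating the docstring sections the metadata
# parser is expected to pick up. The step name, column, and URL are examples only.
def example_flood_risk(primary_featurelayer):
    """
    Adds a flood risk indicator to the primary feature layer.

    Args:
        primary_featurelayer (FeatureLayer): The feature layer to update.

    Returns:
        FeatureLayer: The updated primary feature layer.

    Tagline:
        Flag flood-prone parcels

    Columns Added:
        flood_risk (bool): Whether the parcel falls in a mapped flood zone.

    Source:
        https://example.com/flood-zones

    Known issues:
        Parcels without geometry are skipped.
    """
    # A real step would join external data here; the sketch just passes through.
    return primary_featurelayer
```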
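The "Columns Added" cells are lists of `{'name', 'type', 'description'}` dictionaries parsed from lines of the form `name (type): description`. The self-contained sketch below illustrates that mapping, mirroring the input and output pinned down by `test_get_column_details`; it is an illustration of the format, not the project's implementation.

```python
import re

# Matches lines of the form "name (type): description".
_COLUMN_LINE = re.compile(
    r"^\s*(?P<name>\w+)\s*\((?P<type>[^)]+)\)\s*:\s*(?P<description>.+)$"
)


def sketch_get_column_details(text: str) -> list[dict]:
    """Parse 'name (type): description' lines into column dictionaries."""
    columns = []
    for line in text.splitlines():
        match = _COLUMN_LINE.match(line)
        if match:
            columns.append(
                {
                    "name": match.group("name"),
                    "type": match.group("type").strip(),
                    "description": match.group("description").strip(),
                }
            )
    return columns


# Mirrors the expected output in test_get_column_details.
assert sketch_get_column_details(
    "col1 (int): description for col1\ncol2 (str): description for col2"
) == [
    {"name": "col1", "type": "int", "description": "description for col1"},
    {"name": "col2", "type": "str", "description": "description for col2"},
]
```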
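DAG.md notes that the steps run sequentially even though only some of them depend on one another. As a sanity check, the documented dependencies can be expressed as a mapping and ordered with the standard library; the sketch below uses a subset of the edges from the diagram (keyed by function name rather than the diagram's short aliases) and is not part of the pipeline itself.

```python
from graphlib import TopologicalSorter

# A subset of the dependencies documented in DAG.md:
# each key must run after every step it maps to.
DEPENDS_ON = {
    "pwd_parcels": {"opa_properties"},
    "li_violations": {"opa_properties"},
    "city_owned_properties": {"opa_properties"},
    "delinquencies": {"opa_properties"},
    "community_gardens": {"vacant_properties"},
    "ppr_properties": {"vacant_properties"},
    "council_dists": {"pwd_parcels"},
    "phs_properties": {"pwd_parcels"},
    "li_complaints": {"pwd_parcels"},
    "tree_canopy": {"pwd_parcels"},
    "gun_crimes": {"pwd_parcels"},
    "conservatorship": {"city_owned_properties", "li_violations", "delinquencies"},
    "contig_neighbors": {"community_gardens", "ppr_properties"},
    "priority_level": {"gun_crimes", "li_violations", "li_complaints", "tree_canopy", "phs_properties"},
}

# TopologicalSorter raises CycleError if the documented edges ever form a cycle,
# and static_order() yields one valid sequential execution order.
print(list(TopologicalSorter(DEPENDS_ON).static_order()))
```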
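Finally, the "By Function" table has the same fields as the dictionaries that accumulate in `collected_metadata` (see `test_provide_metadata_with_sample_add_columns`). The sketch below shows one way such entries could be rendered back into a markdown row, assuming the entry shape asserted in that test; the actual documentation generator may format fields (for example, the duration) differently.

```python
FIELDS = [
    "name", "start_time", "end_time", "duration_in_seconds", "description",
    "returns", "tagline", "columns_added", "columns_updated", "source",
    "known_issues", "primary_feature_layer_columns_referenced",
]


def metadata_to_markdown(entries: list[dict]) -> str:
    """Render collected_metadata entries as a markdown table (sketch only)."""
    header = "| " + " | ".join(FIELDS) + " |"
    divider = "| " + " | ".join(":---" for _ in FIELDS) + " |"
    rows = [
        "| " + " | ".join(str(entry.get(field, "")) for field in FIELDS) + " |"
        for entry in entries
    ]
    return "\n".join([header, divider, *rows])


# Example entry mirroring the shape asserted in the unit test.
example = {
    "name": "sample_add_columns",
    "description": "Adds columns to the primary feature layer.",
    "returns": "FeatureLayer: The updated primary feature layer.",
    "start_time": "2021-10-01 00:00:00",
    "end_time": "2021-10-01 00:00:00",
    "duration_in_seconds": 0.0,
    "tagline": "Example tagline",
    "columns_added": [
        {"name": "column1", "type": "int", "description": "Description for column1."}
    ],
    "columns_updated": [],
    "source": "",
    "known_issues": "",
    "primary_feature_layer_columns_referenced": [],
}
print(metadata_to_markdown([example]))
```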