diff --git a/.gitignore b/.gitignore
index a4fe2518..618c28c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
## Env files
.env
data.env
+.envrc
## Data
/notebooks/.ipynb_checkpoints/
@@ -32,6 +33,7 @@ data/src/tmp
.DS_Store
*.pem
.vscode
+.python-version
# debug
npm-debug.log*
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..b4fe9d8c
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,48 @@
+# 1.0.0 (2025-03-25)
+
+
+### Bug Fixes
+
+* **572:** fix webpack warnings + fix linting issues ([#574](https://github.com/adamzev/clean-and-green-philly/issues/574)) ([fbfd459](https://github.com/adamzev/clean-and-green-philly/commit/fbfd459ba147e51a2fd54bee61d5ca63fb74eb98))
+* Add focus styling to header links ([a99065c](https://github.com/adamzev/clean-and-green-philly/commit/a99065cba1607b981702e7aa7e654f24c108ad6e))
+* Add href to dropdown items to increase link size ([2eb4667](https://github.com/adamzev/clean-and-green-philly/commit/2eb4667e7b68f45489c51ea10aa7d15afd2a3c77))
+* closing tooltip popup on map closes property details ([432e8c0](https://github.com/adamzev/clean-and-green-philly/commit/432e8c0a14e1e9cfa199b030019c0730a24741c6))
+* correct the scrolling on the map page ([a850237](https://github.com/adamzev/clean-and-green-philly/commit/a8502372e243057e2b06559cb042d94ea894415a))
+* Remove nested link and button ([d93e8bd](https://github.com/adamzev/clean-and-green-philly/commit/d93e8bd52d80cd8d5b1a30910aaa4b168a11b57a))
+* Update footer markup structure for accessibility ([d5c2a33](https://github.com/adamzev/clean-and-green-philly/commit/d5c2a334f0e5c297f117dca5dc2972b56d7b3542))
+
+
+### Features
+
+* **247:** make the mapbox legend more legible ([bcf6915](https://github.com/adamzev/clean-and-green-philly/commit/bcf691566fc26e834547a43efd863de58abdf3d4))
+* **293:** sort features by priority of high, medium, low ([8cd1058](https://github.com/adamzev/clean-and-green-philly/commit/8cd105872feb0065aa3d207d42dc5df76ef63dbc))
+* **305:** expand the header width of single property table ([b25261e](https://github.com/adamzev/clean-and-green-philly/commit/b25261e5d9e26bdbd9694aa03b4964771528dd3b))
+* **395:** add white background to backbutton bar ([a65f052](https://github.com/adamzev/clean-and-green-philly/commit/a65f052ca2e5c0eac2737f85b4997af6e993f750))
+* **395:** make property selection back button sticky ([6601201](https://github.com/adamzev/clean-and-green-philly/commit/6601201fcf943f8e64a1341327095e9b7edc6b08))
+* **457:** add filter count to button ([b35145a](https://github.com/adamzev/clean-and-green-philly/commit/b35145a1df5f2995294f829e0b7cde20ba0e0f29))
+* **535:** add close download view button ([#579](https://github.com/adamzev/clean-and-green-philly/issues/579)) ([9bd2c53](https://github.com/adamzev/clean-and-green-philly/commit/9bd2c53aa3b49da8136fa6e7e850c4dc4d5e2d74))
+* **544:** adjust remove property section ([#580](https://github.com/adamzev/clean-and-green-philly/issues/580)) ([900af8e](https://github.com/adamzev/clean-and-green-philly/commit/900af8e6fbfc89bd4ed83b17c9eed0a1d5a3de3b))
+* **568:** restyle the property table ([#577](https://github.com/adamzev/clean-and-green-philly/issues/577)) ([2c8cc92](https://github.com/adamzev/clean-and-green-philly/commit/2c8cc92b7f28aecdbe65412476691baa50254292))
+* Add arrow icon to link ([6bce3fa](https://github.com/adamzev/clean-and-green-philly/commit/6bce3fa84ad9827f2117a2d51ed2f2665e83481b))
+* Add content to sidebar ([23d5c52](https://github.com/adamzev/clean-and-green-philly/commit/23d5c52c3fcd1eb1eff902e7d1fd37ab2fe41b6d))
+* Add council district ([e576218](https://github.com/adamzev/clean-and-green-philly/commit/e5762185840f0bbc2289fede8481f5b39a16087d))
+* Add extra data to single detail page ([48e34fe](https://github.com/adamzev/clean-and-green-philly/commit/48e34fe0a6317fc8fc0f1f5e745dd165000d437f))
+* Add font ([c1a8b93](https://github.com/adamzev/clean-and-green-philly/commit/c1a8b93561dfc67c8eed31b3c8befa2b8ae984ec))
+* Add font to map ([1491518](https://github.com/adamzev/clean-and-green-philly/commit/14915180857a482a25d96f9f45f7c838082e2d71))
+* Add font to search field ([3b08069](https://github.com/adamzev/clean-and-green-philly/commit/3b080690ba92007545467bc2ce4bf3bcad26c899))
+* Add info cards ([8170944](https://github.com/adamzev/clean-and-green-philly/commit/8170944fdb8d8be36af062012967323047bda748))
+* Add missing data and atlas link; style table ([8c260d4](https://github.com/adamzev/clean-and-green-philly/commit/8c260d48e015ed6c8c29cc10b8936753cc013ff0))
+* Add percentage, update column widths ([41caf77](https://github.com/adamzev/clean-and-green-philly/commit/41caf77c55eb0a6336abc80047ed83584d250a64))
+* Add priority color square ([ad5a9fa](https://github.com/adamzev/clean-and-green-philly/commit/ad5a9faff3b9596256e680b3d6d95c54caefc7a7))
+* add redirect from map to find-properties ([439ed6c](https://github.com/adamzev/clean-and-green-philly/commit/439ed6c7291251aed9751431cd5bdc2fa6883a15))
+* Add row scope ([f945979](https://github.com/adamzev/clean-and-green-philly/commit/f945979edaee2ad5a003a5b5560a89cc25717f3b))
+* Add spacing below Back button ([7de48c5](https://github.com/adamzev/clean-and-green-philly/commit/7de48c50f177c0275cebac0a4a7d9dfce8d2cd2c))
+* Address a few more links ([b3b6733](https://github.com/adamzev/clean-and-green-philly/commit/b3b673375f1472f15d80bffeff378c973d6cb49d))
+* Adjust table per requirements ([36e6cfa](https://github.com/adamzev/clean-and-green-philly/commit/36e6cfa5934defadcf1a8b8ad1c09ffe609d0ec5))
+* Change Find Properties and Learn More to links ([1de0b47](https://github.com/adamzev/clean-and-green-philly/commit/1de0b474811ab2d4281d94c6a9d5882a8ebb0c13))
+* Close sidebar if clicking on empty map ([612117d](https://github.com/adamzev/clean-and-green-philly/commit/612117d6c54bc62baaae4f773013e196232c2dba))
+* Navigate to map on click; show popups ([183b6a2](https://github.com/adamzev/clean-and-green-philly/commit/183b6a224d651726a6e465d7c9549ef437626f9e))
+* Open sidebar from map ([5474367](https://github.com/adamzev/clean-and-green-philly/commit/54743673e99ffd6b939009ad54e4d0a4252ba044))
+* Pass selected property to map view ([663f142](https://github.com/adamzev/clean-and-green-philly/commit/663f14236ccb9246efa8882c0dfa44e8cbcba0f7))
+* underline links ([5f23d09](https://github.com/adamzev/clean-and-green-philly/commit/5f23d09da9a7258b284ccfef54b1be0ee2a8e1f8))
+* Update aria label and role ([765e5a4](https://github.com/adamzev/clean-and-green-philly/commit/765e5a4fef1941e93815ae4789093e047183101c))
diff --git a/data/Dockerfile b/data/Dockerfile
index d5cf8d40..381a4da0 100644
--- a/data/Dockerfile
+++ b/data/Dockerfile
@@ -57,4 +57,4 @@ WORKDIR /usr/src/app
# Use Pipenv to run the script
# Adjust the path to your main Python script if needed
-CMD ["pipenv", "run", "python", "./script.py"]
+CMD ["pipenv", "run", "python", "./main.py"]
diff --git a/data/Dockerfile-pg b/data/Dockerfile-pg
deleted file mode 100644
index d4d62d77..00000000
--- a/data/Dockerfile-pg
+++ /dev/null
@@ -1,26 +0,0 @@
-#
-# NOTE: THIS DOCKERFILE IS GENERATED VIA "make update"! PLEASE DO NOT EDIT IT DIRECTLY.
-#
-
-FROM postgres:16-bullseye
-
-LABEL maintainer="PostGIS Project - https://postgis.net" \
- org.opencontainers.image.description="PostGIS 3.4.3+dfsg-2.pgdg110+1 spatial database extension with PostgreSQL 16 bullseye" \
- org.opencontainers.image.source="https://github.com/postgis/docker-postgis"
-
-ENV POSTGIS_MAJOR 3
-ENV POSTGIS_VERSION 3.4.3+dfsg-2.pgdg110+1
-
-RUN apt-get update \
- && apt-cache showpkg postgresql-$PG_MAJOR-postgis-$POSTGIS_MAJOR \
- && apt-get install -y --no-install-recommends \
- # ca-certificates: for accessing remote raster files;
- # fix: https://github.com/postgis/docker-postgis/issues/307
- ca-certificates \
- \
- postgresql-$PG_MAJOR-postgis-$POSTGIS_MAJOR=$POSTGIS_VERSION \
- postgresql-$PG_MAJOR-postgis-$POSTGIS_MAJOR-scripts \
- && rm -rf /var/lib/apt/lists/*
-
-RUN mkdir -p /docker-entrypoint-initdb.d
-
diff --git a/data/docker-compose.yml b/data/docker-compose.yml
index 5737d39a..11c169e9 100644
--- a/data/docker-compose.yml
+++ b/data/docker-compose.yml
@@ -55,25 +55,6 @@ services:
- host.docker.internal:host-gateway
network_mode: 'host'
- postgres:
- container_name: cagp-postgres
- build:
- context: .
- dockerfile: Dockerfile-pg
- environment:
- PGPORT: 5433
- POSTGRES_PASSWORD:
- restart: always
- ports:
- - '5433:5433'
- volumes:
- - database_volume:/var/lib/postgresql/data
- - ./init_pg.sql:/docker-entrypoint-initdb.d/init_pg.sql
- - /etc/timezone:/etc/timezone:ro
- - /etc/localtime:/etc/localtime:ro
- extra_hosts:
- - host.docker.internal:host-gateway
-
postgres-timescale:
container_name: cagp-postgres-timescale
build:
@@ -93,5 +74,4 @@ services:
extra_hosts:
- host.docker.internal:host-gateway
volumes:
- database_volume:
timescale_database_volume:
diff --git a/data/src/classes/__init__.py b/data/src/classes/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/data/src/classes/backup_archive_database.py b/data/src/classes/backup_archive_database.py
deleted file mode 100644
index 60f5ef4f..00000000
--- a/data/src/classes/backup_archive_database.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import logging as log
-import os
-import subprocess
-from datetime import datetime, timedelta
-
-import sqlalchemy as sa
-from config.config import log_level, max_backup_schema_days, tiles_file_id_prefix, tile_file_backup_directory
-from config.psql import conn, local_engine, url
-from data_utils.utils import mask_password
-from sqlalchemy import inspect
-from classes.featurelayer import google_cloud_bucket
-
-log.basicConfig(level=log_level)
-
-backup_schema_name: str = "backup_"
-""" the prefix for the backup schemas """
-
-date_time_format: str = "%Y_%m_%dt%H_%M_%S"
-""" the datetime format for the backup schema names """
-
-
-class BackupArchiveDatabase:
- """
- Class to manage creating a backup of the public schema before the etl refresh is run. After the etl job and data differences are reported, this class moves the current backup schema to a timestamped backup and prunes older backup schemas.
- """
-
- def __init__(self):
- self.timestamp_string = datetime.now().strftime(date_time_format)
- self.backup_schema_archive_name = backup_schema_name + self.timestamp_string
-
- def backup_schema(self):
- """
- backup the whole public schema to another schema in the same db.
- pgdump the public schema, replace public schema name with backup schema name, clean up the special column types, and import it with psql in one piped command
- """
-
- pgdump_command = (
- # first, dump the schema only where we can safely replace all 'public' strings with 'backup_'
- "pg_dump "
- + url
- + " -s --schema public | "
- + " sed 's/public/" + backup_schema_name + "/g'"
- + " | sed 's/"
- + backup_schema_name
- + ".geometry/public.geometry/' | sed 's/"
- + backup_schema_name
- + ".spatial_ref_sys/public.spatial_ref_sys/'"
- + " | sed 's/backup__/public_/g'" # ppr_properties.public_name column needs to be restored.
- + " | psql -v ON_ERROR_STOP=1 "
- + url
- + " > /dev/null "
- # then dump the data only and substitute the word public only where it is in DDL, not in the data
- + " && pg_dump "
- + url
- + " -a --schema public | sed 's/COPY public./COPY "
- + backup_schema_name
- + "./g' | sed 's/"
- + backup_schema_name
- + ".geometry/public.geometry/' | sed 's/"
- + backup_schema_name
- + ".spatial_ref_sys/public.spatial_ref_sys/' | psql -v ON_ERROR_STOP=1 "
- + url
- )
- log.debug(mask_password(pgdump_command))
- complete_process = subprocess.run(
- pgdump_command,
- check=False,
- shell=True,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- )
-
- if complete_process.returncode != 0 or complete_process.stderr:
- raise RuntimeError(
- "pg_dump command "
- + mask_password(pgdump_command)
- + " did not exit with success. "
- + complete_process.stderr
- )
-
- def archive_backup_schema(self):
- """
- mv backup_ schema to "backup_" + backup_timestamp
- """
- sql = (
- "ALTER SCHEMA "
- + backup_schema_name
- + " RENAME TO "
- + self.backup_schema_archive_name
- )
- log.debug(sql)
- conn.execute(sa.DDL(sql))
-
- def prune_old_archives(self):
- """
- drop backup schemas that are too old
- """
- # list all backup schemas
- schemas = inspect(local_engine).get_schema_names()
- cutoff = datetime.now() - timedelta(days=max_backup_schema_days)
- for schema in schemas:
- if schema.startswith(backup_schema_name):
- timestamp = schema.replace(backup_schema_name, "")
- backed_up_time = datetime.strptime(timestamp, date_time_format)
- if backed_up_time < cutoff:
- sql = "drop schema " + schema + " cascade"
- log.debug(sql)
- conn.execute(sa.DDL(sql))
-
- def is_backup_schema_exists(self) -> bool:
- """ whether the backup schema exists
-
- Returns:
- bool: whether true
- """
- return backup_schema_name in inspect(local_engine).get_schema_names()
-
- def backup_tiles_file(self):
- """backup the main tiles file to a timestamped copy in the backup/ folder in GCP
- """
- bucket = google_cloud_bucket()
- count: int = 0
- for blob in bucket.list_blobs(prefix=tiles_file_id_prefix):
- suffix: str = '_' + self.timestamp_string
- name, ext = os.path.splitext(blob.name)
- backup_file_name: str = tile_file_backup_directory + "/" + name + suffix + ext
- log.debug(backup_file_name)
- bucket.copy_blob(blob,destination_bucket=bucket,new_name=backup_file_name)
- count += 1
- if count == 0:
- log.warning("No files were found to back up.")
diff --git a/data/src/classes/diff_report.py b/data/src/classes/diff_report.py
deleted file mode 100644
index 2ac7395f..00000000
--- a/data/src/classes/diff_report.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import logging as log
-import os
-import re
-import smtplib
-import subprocess
-from email.mime.text import MIMEText
-
-from classes.backup_archive_database import backup_schema_name
-from classes.featurelayer import google_cloud_bucket
-from config.config import (
- from_email,
- log_level,
- report_to_email,
- report_to_slack_channel,
- smtp_server,
-)
-from config.psql import conn, url
-from data_utils.utils import mask_password
-from slack_sdk import WebClient
-
-log.basicConfig(level=log_level)
-
-class DiffTable:
- """Metadata about a table to be run through data-diff
- """
- def __init__(self, table: str, pk_cols: list[str], where: str = None):
- """constructor
-
- Args:
- table (str): the name of the table in postgres
- pk_cols (list[str]): the list of columns in the primary key
- where (str, optional): any additional where clause to limit the rows being compared
- """
- self.table = table
- self.pk_cols = pk_cols
- self.where = where
-
-class DiffReport:
- """
- Class to manage computing data differences for all tables between the newly imported schema and the last schema. Build a report of summary differences for all tables. Log detailed differences to a table in the old backed-up schema. Post difference summary to Slack and or email.
- """
-
- def __init__(self, timestamp_string: str = None):
- """constructor
-
- Args:
- timestamp_string (str, optional): This should be the same timestamp used in the backup to keep things consistent. We only use this as the folder name for the diff detail files in GCP. Defaults to None.
- """
- self.diff_tables = self._list_diff_tables()
- self.timestamp_string = timestamp_string
- self.report: str = "The back-end data has been fully refreshed. Here is the difference report on " + str(len(self.diff_tables)) + " key tables.\nLegend: table A = new data, table B = old data.\n\n"
-
- def run(self):
- """
- run the report and slack or email it.
- """
-
- for diff_table in self.diff_tables:
- log.debug("Process table %s with pks %s", diff_table.table, str(diff_table.pk_cols))
- summary = diff_table.table + "\n" + self.compare_table(diff_table)
- # if no differences, do not report.
- if self._summary_shows_differences(summary):
- self.report += summary
- self.report += "Details: " + self.detail_report(diff_table.table) + "\n"
- else:
- self.report += diff_table.table + "\nNo difference\n"
- self.report += "\n"
- log.debug("\n")
- log.debug(self.report)
- self.send_report_to_slack()
- self.email_report()
-
- def _summary_shows_differences(self, summary: str) -> bool:
- """check the data-diff summary report to see if there are any differences
-
- Args:
- summary (str): the summary output
-
- Returns:
- bool: whether any add, deletes or updates were reported
- """
- return not (
- "0 rows exclusive to table A" in summary
- and "0 rows exclusive to table B" in summary
- and "0 rows updated" in summary
- )
-
- def detail_report(self, table: str) -> str:
- """Generate the html from the detail diff report and upload to Google cloud as an html file
- Args:
- table (str): the name of the core table being compared
-
- Returns:
- str: the full url of the report
- """
- return self._save_detail_report_to_cloud(
- self.generate_table_detail_report(table), table
- )
-
- def _save_detail_report_to_cloud(self, html: str, table: str) -> str:
- """Save this html to a public cloud folder in Google named with the timestamp of the backup
-
- Args:
- html (str): the html content
- table (str): the name of the core table being compared
-
- Returns:
- str: the full url of the report
- """
- path: str = "diff/" + self.timestamp_string + "/" + table + ".html"
- bucket = google_cloud_bucket()
- blob = bucket.blob(path)
- blob.upload_from_string(html, content_type="text/html")
- return "https://storage.googleapis.com/" + bucket.name + "/" + path
-
- def generate_table_detail_report(self, table: str) -> str:
- """
- generate an html table of the details of differences in this table from the materialized diff table in the backup schema from data-diff
- """
- sql: str = "select * from " + backup_schema_name + "." + table + "_diff"
- cur = conn.connection.cursor()
- cur.execute(sql)
- html: str = "
"
-
- column_names = [desc[0] for desc in cur.description]
- for column in column_names:
- html += "| " + column + " | "
- html += "
"
- for row in cur.fetchall():
- html += ""
- for value in row:
- html += "| " + str(value) + " | "
- html += "
"
- html += "
"
- return html
-
- def _list_diff_tables(self) -> list[DiffTable]:
- """
- list table metadata to do the diff on
- Returns:
- list[DiffTable]: the list of metadata
- """
- return [
- DiffTable(table="vacant_properties",pk_cols=["opa_id", "parcel_type"],where="opa_id is not null"),
- DiffTable(table="li_complaints",pk_cols=["service_request_id"]),
- DiffTable(table="li_violations",pk_cols=["violationnumber", "opa_account_num"],where="opa_account_num is not null"),
- DiffTable(table="opa_properties",pk_cols=["parcel_number"]),
- DiffTable(table="property_tax_delinquencies",pk_cols=["opa_number"],where="opa_number <> 0")
- ]
-
- def compare_table(self, diff_table: DiffTable) -> str:
- """
- run data-diff to compare the newly imported table in the public schema to the table in the backup schema.
- We could use the data-diff python API but the cl is much clearer and has features I could not find in the API.
- """
- table = diff_table.table
- pks = diff_table.pk_cols
-
- # compare the tables and output the summary stats to include in the report. Materialize the details
- # of the differences to a table in the backup schema named table_name_diff
- data_diff_command = (
- "data-diff "
- + url
- + " public."
- + table
- + " "
- + backup_schema_name
- + "."
- + table
- + " -k "
- + " -k ".join(pks)
- + " -c '%' -m "
- + backup_schema_name
- + "."
- + table
- + "_diff --stats"
- + " --table-write-limit 100000"
- + (" -w '" + diff_table.where + "'" if diff_table.where else "")
- )
- log.debug(mask_password(data_diff_command))
-
- complete_process = subprocess.run(
- data_diff_command, check=False, shell=True, capture_output=True
- )
-
- if complete_process.returncode != 0 or complete_process.stderr:
- raise RuntimeError(
- "data-diff command did not exit with success. "
- + complete_process.stderr.decode()
- )
- output = complete_process.stdout.decode()
- return re.sub(r"\nExtra-Info:.*", "", output, flags=re.DOTALL)
-
- def send_report_to_slack(self):
- """
- post the summary report to the slack channel if configured.
- """
- if report_to_slack_channel:
- token = os.environ["CAGP_SLACK_API_TOKEN"]
- client = WebClient(token=token)
-
- # Send a message
- client.chat_postMessage(
- channel=report_to_slack_channel,
- text=self.report,
- username="CAGP Diff Bot",
- )
-
- def email_report(self):
- """
- email the summary report if configured.
- """
- if report_to_email:
- # Create a text/plain message
- msg = MIMEText(self.report)
- msg["Subject"] = "Clean & Green Philly: Data difference report"
- msg["From"] = from_email
- msg["To"] = report_to_email
-
- # Send the message via our own SMTP server
- s = smtplib.SMTP(smtp_server)
- s.sendmail(from_email, [report_to_email], msg.as_string())
- s.quit()
-
-
diff --git a/data/src/classes/featurelayer.py b/data/src/classes/featurelayer.py
deleted file mode 100644
index f3eff73d..00000000
--- a/data/src/classes/featurelayer.py
+++ /dev/null
@@ -1,409 +0,0 @@
-import logging as log
-import os
-import subprocess
-import traceback
-
-import geopandas as gpd
-import pandas as pd
-import requests
-import sqlalchemy as sa
-from config.config import (
- FORCE_RELOAD,
- USE_CRS,
- log_level,
- min_tiles_file_size_in_bytes,
- write_production_tiles_file,
-)
-from config.psql import conn, local_engine
-from esridump.dumper import EsriDumper
-from google.cloud import storage
-from google.cloud.storage.bucket import Bucket
-from shapely import Point, wkb
-
-log.basicConfig(level=log_level)
-
-
-def google_cloud_bucket() -> Bucket:
- """Build the google cloud bucket with name configured in your environ or default of cleanandgreenphl
-
- Returns:
- Bucket: the gcp bucket
- """
- credentials_path = os.path.expanduser("/app/service-account-key.json")
-
- if not os.path.exists(credentials_path):
- raise FileNotFoundError(f"Credentials file not found at {credentials_path}")
-
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
- bucket_name = os.getenv("GOOGLE_CLOUD_BUCKET_NAME", "cleanandgreenphl")
- project_name = os.getenv("GOOGLE_CLOUD_PROJECT", "clean-and-green-philly")
-
- storage_client = storage.Client(project=project_name)
- return storage_client.bucket(bucket_name)
-
-
-
-
-class FeatureLayer:
- """
- FeatureLayer is a class to represent a GIS dataset. It can be initialized with a URL to an Esri Feature Service, a SQL query to Carto, or a GeoDataFrame.
- """
-
- def __init__(
- self,
- name,
- esri_rest_urls=None,
- carto_sql_queries=None,
- gdf=None,
- crs=USE_CRS,
- force_reload=FORCE_RELOAD,
- from_xy=False,
- use_wkb_geom_field=None,
- cols: list[str] = None,
- bucket: Bucket = None
- ):
- self.name = name
- self.esri_rest_urls = (
- [esri_rest_urls] if isinstance(esri_rest_urls, str) else esri_rest_urls
- )
- self.carto_sql_queries = (
- [carto_sql_queries]
- if isinstance(carto_sql_queries, str)
- else carto_sql_queries
- )
- self.gdf = gdf
- self.crs = crs
- self.cols = cols
- self.psql_table = name.lower().replace(" ", "_")
- self.input_crs = "EPSG:4326" if not from_xy else USE_CRS
- self.use_wkb_geom_field = use_wkb_geom_field
- self.bucket = bucket or google_cloud_bucket()
-
- inputs = [self.esri_rest_urls, self.carto_sql_queries, self.gdf]
- non_none_inputs = [i for i in inputs if i is not None]
-
- if len(non_none_inputs) > 0:
- if self.esri_rest_urls is not None:
- self.type = "esri"
- elif self.carto_sql_queries is not None:
- self.type = "carto"
- elif self.gdf is not None:
- self.type = "gdf"
-
- if force_reload:
- self.load_data()
- else:
- psql_exists = self.check_psql()
- if not psql_exists:
- self.load_data()
- else:
- print(f"Initialized FeatureLayer {self.name} with no data.")
-
- def check_psql(self):
- try:
- if not sa.inspect(local_engine).has_table(self.psql_table):
- print(f"Table {self.psql_table} does not exist")
- return False
- psql_table = gpd.read_postgis(
- f"SELECT * FROM {self.psql_table}", conn, geom_col="geometry"
- )
- if len(psql_table) == 0:
- return False
- else:
- print(f"Loading data for {self.name} from psql...")
- self.gdf = psql_table
- return True
- except Exception as e:
- print(f"Error loading data for {self.name}: {e}")
- return False
-
- def load_data(self):
- print(f"Loading data for {self.name} from {self.type}...")
- if self.type == "gdf":
- pass
- else:
- try:
- if self.type == "esri":
- if self.esri_rest_urls is None:
- raise ValueError("Must provide a URL to load data from Esri")
-
- gdfs = []
- for url in self.esri_rest_urls:
- parcel_type = (
- "Land"
- if "Vacant_Indicators_Land" in url
- else "Building"
- if "Vacant_Indicators_Bldg" in url
- else None
- )
- self.dumper = EsriDumper(url)
- features = [feature for feature in self.dumper]
-
- geojson_features = {
- "type": "FeatureCollection",
- "features": features,
- }
-
- this_gdf = gpd.GeoDataFrame.from_features(
- geojson_features, crs=self.input_crs
- )
-
- # Check if 'X' and 'Y' columns exist and create geometry if necessary
- if "X" in this_gdf.columns and "Y" in this_gdf.columns:
- this_gdf["geometry"] = this_gdf.apply(
- lambda row: Point(row["X"], row["Y"]), axis=1
- )
- elif "geometry" not in this_gdf.columns:
- raise ValueError(
- "No geometry information found in the data."
- )
-
- this_gdf = this_gdf.to_crs(USE_CRS)
-
- # Assign the parcel_type to the GeoDataFrame
- if parcel_type:
- this_gdf["parcel_type"] = parcel_type
-
- gdfs.append(this_gdf)
-
- self.gdf = pd.concat(gdfs, ignore_index=True)
-
- elif self.type == "carto":
- if self.carto_sql_queries is None:
- raise ValueError(
- "Must provide a SQL query to load data from Carto"
- )
-
- gdfs = []
- for sql_query in self.carto_sql_queries:
- response = requests.get(
- "https://phl.carto.com/api/v2/sql", params={"q": sql_query}
- )
-
- data = response.json()["rows"]
- df = pd.DataFrame(data)
- geometry = (
- wkb.loads(df[self.use_wkb_geom_field], hex=True)
- if self.use_wkb_geom_field
- else gpd.points_from_xy(df.x, df.y)
- )
-
- gdf = gpd.GeoDataFrame(
- df,
- geometry=geometry,
- crs=self.input_crs,
- )
- gdf = gdf.to_crs(USE_CRS)
-
- gdfs.append(gdf)
- self.gdf = pd.concat(gdfs, ignore_index=True)
-
- # Drop columns
- if self.cols:
- self.cols.append("geometry")
- self.gdf = self.gdf[self.cols]
-
- # save self.gdf to psql
- # rename columns to lowercase for table creation in postgres
- if self.cols:
- self.gdf = self.gdf.rename(
- columns={x: x.lower() for x in self.cols}
- )
- self.gdf.to_postgis(
- name=self.psql_table,
- con=conn,
- if_exists="replace",
- chunksize=1000,
- )
- except Exception as e:
- print(f"Error loading data for {self.name}: {e}")
- traceback.print_exc()
- self.gdf = None
-
- def spatial_join(self, other_layer, how="left", predicate="intersects"):
- """
- Spatial joins in this script are generally left intersect joins.
- They also can create duplicates, so we drop duplicates after the join.
- Note: We may want to revisit the duplicates.
- """
-
- # If other_layer.gdf isn't a geodataframe, try to make it one
- if not isinstance(other_layer.gdf, gpd.GeoDataFrame):
- try:
- other_layer.rebuild_gdf()
-
- except Exception as e:
- print(f"Error converting other_layer to GeoDataFrame: {e}")
- return
-
- self.gdf = gpd.sjoin(self.gdf, other_layer.gdf, how=how, predicate=predicate)
- self.gdf.drop(columns=["index_right"], inplace=True)
- self.gdf.drop_duplicates(inplace=True)
-
- # Coerce opa_id to integer and drop rows where opa_id is null or non-numeric
- self.gdf.loc[:, "opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
- self.gdf = self.gdf.dropna(subset=["opa_id"])
-
- def opa_join(self, other_df, opa_column):
- """
- Join 2 dataframes based on opa_id and keep the 'geometry' column from the left dataframe if it exists in both.
- """
-
- # Coerce opa_column to integer and drop rows where opa_column is null or non-numeric
- other_df.loc[:, opa_column] = pd.to_numeric(
- other_df[opa_column], errors="coerce"
- )
- other_df = other_df.dropna(subset=[opa_column])
-
- # Coerce opa_id to integer and drop rows where opa_id is null or non-numeric
- self.gdf.loc[:, "opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
- self.gdf = self.gdf.dropna(subset=["opa_id"])
-
- # Perform the merge
- joined = self.gdf.merge(
- other_df, how="left", right_on=opa_column, left_on="opa_id"
- )
-
- # Check if 'geometry' column exists in both dataframes and clean up
- if "geometry_x" in joined.columns and "geometry_y" in joined.columns:
- joined = joined.drop(columns=["geometry_y"]).copy() # Ensure a full copy
- joined = joined.rename(columns={"geometry_x": "geometry"})
-
- if opa_column != "opa_id":
- joined = joined.drop(columns=[opa_column])
-
- # Assign the joined DataFrame to self.gdf as a full copy
- self.gdf = joined.copy()
- self.rebuild_gdf()
-
- def rebuild_gdf(self):
- self.gdf = gpd.GeoDataFrame(self.gdf, geometry="geometry", crs=self.crs)
-
- def create_centroid_gdf(self):
- """
- Convert the geometry of the GeoDataFrame to centroids.
- """
- self.centroid_gdf = self.gdf.copy()
- self.centroid_gdf.loc[:, "geometry"] = self.gdf["geometry"].centroid
-
- def build_and_publish(self, tiles_file_id_prefix: str) -> None:
- """
- Builds PMTiles and a Parquet file from a GeoDataFrame and publishes them to Google Cloud Storage.
-
- Args:
- tiles_file_id_prefix (str): The ID prefix used for naming the PMTiles and Parquet files, coming from config.
-
- Raises:
- ValueError: Raised if the generated PMTiles file is smaller than the minimum allowed size, indicating a potential corruption or incomplete file.
- """
- zoom_threshold: int = 13
-
- # Export the GeoDataFrame to a temporary GeoJSON file
- temp_geojson_points: str = f"tmp/temp_{tiles_file_id_prefix}_points.geojson"
- temp_geojson_polygons: str = f"tmp/temp_{tiles_file_id_prefix}_polygons.geojson"
- temp_pmtiles_points: str = f"tmp/temp_{tiles_file_id_prefix}_points.pmtiles"
- temp_pmtiles_polygons: str = f"tmp/temp_{tiles_file_id_prefix}_polygons.pmtiles"
- temp_merged_pmtiles: str = f"tmp/temp_{tiles_file_id_prefix}_merged.pmtiles"
- temp_parquet: str = f"tmp/{tiles_file_id_prefix}.parquet"
-
- # Reproject
- gdf_wm = self.gdf.to_crs(epsg=4326)
- gdf_wm.to_file(temp_geojson_polygons, driver="GeoJSON")
-
- # Create points dataset
- self.centroid_gdf = self.gdf.copy()
- self.centroid_gdf["geometry"] = self.centroid_gdf["geometry"].centroid
- self.centroid_gdf = self.centroid_gdf.to_crs(epsg=4326)
- self.centroid_gdf.to_file(temp_geojson_points, driver="GeoJSON")
-
- # Load the GeoJSON from the polygons, drop geometry, and save as Parquet
- gdf_polygons = gpd.read_file(temp_geojson_polygons)
- df_no_geom = gdf_polygons.drop(columns=["geometry"])
-
- # Check if the DataFrame has fewer than 25,000 rows
- num_rows, num_cols = df_no_geom.shape
- if num_rows < 25000:
- print(
- f"Parquet file has {num_rows} rows, which is fewer than 25,000. Skipping upload."
- )
- return
-
- # Save the DataFrame as Parquet
- df_no_geom.to_parquet(temp_parquet)
-
- # Upload Parquet to Google Cloud Storage
- blob_parquet = self.bucket.blob(f"{tiles_file_id_prefix}.parquet")
- try:
- blob_parquet.upload_from_filename(temp_parquet)
- parquet_size = os.stat(temp_parquet).st_size
- parquet_size_mb = parquet_size / (1024 * 1024)
- print(
- f"Parquet upload successful! Size: {parquet_size} bytes ({parquet_size_mb:.2f} MB), Dimensions: {num_rows} rows, {num_cols} columns."
- )
- except Exception as e:
- print(f"Parquet upload failed: {e}")
- return
-
- # Command for generating PMTiles for points up to zoom level zoom_threshold
- points_command: list[str] = [
- "tippecanoe",
- f"--output={temp_pmtiles_points}",
- f"--maximum-zoom={zoom_threshold}",
- "--minimum-zoom=10",
- "-zg",
- "-aC",
- "-r0",
- temp_geojson_points,
- "-l",
- "vacant_properties_tiles_points",
- "--force",
- ]
-
- # Command for generating PMTiles for polygons from zoom level zoom_threshold
- polygons_command: list[str] = [
- "tippecanoe",
- f"--output={temp_pmtiles_polygons}",
- f"--minimum-zoom={zoom_threshold}",
- "--maximum-zoom=16",
- "-zg",
- "--no-tile-size-limit",
- temp_geojson_polygons,
- "-l",
- "vacant_properties_tiles_polygons",
- "--force",
- ]
-
- # Command for merging the two PMTiles files into a single output file
- merge_command: list[str] = [
- "tile-join",
- f"--output={temp_merged_pmtiles}",
- "--no-tile-size-limit",
- temp_pmtiles_polygons,
- temp_pmtiles_points,
- "--force",
- ]
-
- # Run the commands
- for command in [points_command, polygons_command, merge_command]:
- subprocess.run(command)
-
- write_files: list[str] = [f"{tiles_file_id_prefix}_staging.pmtiles"]
-
- if write_production_tiles_file:
- write_files.append(f"{tiles_file_id_prefix}.pmtiles")
-
- # Check whether the temp saved tiles files is big enough.
- file_size: int = os.stat(temp_merged_pmtiles).st_size
- if file_size < min_tiles_file_size_in_bytes:
- raise ValueError(
- f"{temp_merged_pmtiles} is {file_size} bytes in size but should be at least {min_tiles_file_size_in_bytes}. Therefore, we are not uploading any files to the GCP bucket. The file may be corrupt or incomplete."
- )
-
- # Upload PMTiles to Google Cloud Storage
- for file in write_files:
- blob = self.bucket.blob(file)
- try:
- blob.upload_from_filename(temp_merged_pmtiles)
- print(f"PMTiles upload successful for {file}!")
- except Exception as e:
- print(f"PMTiles upload failed for {file}: {e}")
\ No newline at end of file
diff --git a/data/src/classes/slack_error_reporter.py b/data/src/classes/slack_error_reporter.py
deleted file mode 100644
index 1c443d4b..00000000
--- a/data/src/classes/slack_error_reporter.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os
-from slack_sdk import WebClient
-
-
-def send_error_to_slack(error_message: str) -> None:
- """Send error message to Slack."""
- token: str | None = os.getenv("CAGP_SLACK_API_TOKEN") # token can be None
- if token:
- client = WebClient(token=token)
- client.chat_postMessage(
- channel="clean-and-green-philly-back-end", # Replace with actual Slack channel ID
- text=error_message,
- username="Backend Error Reporter",
- )
- else:
- raise ValueError("Slack API token not found in environment variables.")
diff --git a/data/src/config/config.py b/data/src/config/config.py
index 7b7fa01c..4b240cf2 100644
--- a/data/src/config/config.py
+++ b/data/src/config/config.py
@@ -1,5 +1,4 @@
import logging
-import os
from pathlib import Path
FORCE_RELOAD = False
diff --git a/data/src/constants/__init__.py b/data/src/constants/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/data/src/constants/services.py b/data/src/constants/services.py
deleted file mode 100644
index dcc4daac..00000000
--- a/data/src/constants/services.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import datetime
-
-VACANT_PROPS_LAYERS_TO_LOAD = [
- "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Vacant_Indicators_Land/FeatureServer/0/",
- "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Vacant_Indicators_Bldg/FeatureServer/0/",
-]
-
-COUNCIL_DISTRICTS_TO_LOAD = [
- "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/Council_Districts_2024/FeatureServer/0/"
-]
-
-CITY_OWNED_PROPERTIES_TO_LOAD = [
- "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/LAMAAssets/FeatureServer/0/"
-]
-
-PHS_LAYERS_TO_LOAD = [
- "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/phs_landcare/FeatureServer/0",
-]
-
-RCOS_LAYERS_TO_LOAD = [
- "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Zoning_RCO/FeatureServer/0/"
-]
-
-COMMUNITY_GARDENS_TO_LOAD = [
- "https://services2.arcgis.com/qjOOiLCYeUtwT7x7/arcgis/rest/services/PHS_NGT_Supported_Current_view/FeatureServer/0/"
-]
-
-PPR_PROPERTIES_TO_LOAD = [
- "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0"
-]
-
-one_year_ago = (datetime.datetime.now() - datetime.timedelta(days=365)).strftime(
- "%Y-%m-%d"
-)
-
-# Load data for complaints from L&I
-COMPLAINTS_SQL_QUERY = f"SELECT address, service_request_id, subject, status, service_name, service_code, lat AS y, lon AS x FROM public_cases_fc WHERE requested_datetime >= '{one_year_ago}' AND lat IS NOT NULL"
-
-VIOLATIONS_SQL_QUERY = f"SELECT parcel_id_num, casenumber, casecreateddate, casetype, casestatus, violationnumber, violationcodetitle, violationstatus, opa_account_num, address, opa_owner, geocode_x AS x, geocode_y AS y FROM violations WHERE violationdate >= '{one_year_ago}' AND geocode_x IS NOT NULL"
-
-GUNCRIME_SQL_QUERY = f"SELECT text_general_code, dispatch_date, point_x AS x, point_y AS y FROM incidents_part1_part2 WHERE dispatch_date_time >= '{one_year_ago}' AND text_general_code IN ('Aggravated Assault Firearm', 'Robbery Firearm') AND point_x IS NOT NULL"
-
-DRUGCRIME_SQL_QUERY = f"SELECT text_general_code, dispatch_date, point_x AS x, point_y AS y FROM incidents_part1_part2 WHERE dispatch_date_time >= '{one_year_ago}' AND text_general_code IN ('Narcotic / Drug Law Violations') AND point_x IS NOT NULL"
-
-DELINQUENCIES_QUERY = "SELECT * FROM real_estate_tax_delinquencies"
-
-OPA_PROPERTIES_QUERY = "SELECT building_code_description, market_value, sale_date, sale_price, parcel_number, owner_1, owner_2, mailing_address_1, mailing_address_2, mailing_care_of, mailing_street, mailing_zip, mailing_city_state, zip_code, zoning, the_geom FROM opa_properties_public"
-
-PWD_PARCELS_QUERY = "SELECT *, the_geom FROM pwd_parcels"
-
-UNSAFE_BUILDINGS_QUERY = "SELECT * FROM unsafe"
-
-IMMINENT_DANGER_BUILDINGS_QUERY = "SELECT * FROM imm_dang"
-
-PERMITS_QUERY = f"""
- SELECT
- address,
- addressobjectid,
- approvedscopeofwork,
- commercialorresidential,
- opa_account_num,
- permittype,
- status,
- unit_num,
- unit_type,
- permitissuedate,
- typeofwork,
- the_geom,
- ST_AsGeoJSON(the_geom)::json AS the_geom_geojson
- FROM permits
- WHERE permitissuedate >= '{one_year_ago}'
- """
-
-NBHOODS_URL = "https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/philadelphia-neighborhoods/philadelphia-neighborhoods.geojson"
-
-CENSUS_BGS_URL = (
- "https://opendata.arcgis.com/datasets/2f982bada233478ea0100528227febce_0.geojson"
-)
\ No newline at end of file
diff --git a/data/src/data_utils/__init__.py b/data/src/data_utils/__init__.py
deleted file mode 100644
index e1709a69..00000000
--- a/data/src/data_utils/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from .city_owned_properties import city_owned_properties
-from .phs_properties import phs_properties
-from .l_and_i import l_and_i
-from .rco_geoms import rco_geoms
-from .tree_canopy import tree_canopy
-from .nbhoods import nbhoods
-from .gun_crimes import gun_crimes
-from .deliquencies import deliquencies
-from .opa_properties import opa_properties
-from .vacant_properties import vacant_properties
-from .priority_level import priority_level
-from .access_process import access_process
-
-__all__ = [
- "city_owned_properties",
- "phs_properties",
- "l_and_i",
- "rco_geoms",
- "tree_canopy",
- "nbhoods",
- "gun_crimes",
- "deliquencies",
- "opa_properties",
- "vacant_properties",
- "priority_level",
- "access_process",
-]
diff --git a/data/src/data_utils/access_process.py b/data/src/data_utils/access_process.py
deleted file mode 100644
index 7c8e79de..00000000
--- a/data/src/data_utils/access_process.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from typing import Any
-
-
-def access_process(dataset: Any) -> Any:
- """
- Process a dataset to determine the access process for each property based on
- city ownership and market value. The result is added as a new column in the dataset.
-
- Args:
- dataset (Any): The dataset containing a GeoDataFrame named `gdf` with
- columns "city_owner_agency" and "market_value".
-
- Returns:
- Any: The updated dataset with an additional "access_process" column.
-
- Side Effects:
- Prints the distribution of the "access_process" column.
- """
- access_processes = []
-
- for _, row in dataset.gdf.iterrows():
- # Decision Points
- city_owner_agency = row["city_owner_agency"]
- market_value_over_1000 = (
- row["market_value"] and float(row["market_value"]) > 1000
- )
-
- # Simplified decision logic
- if city_owner_agency == "Land Bank (PHDC)":
- access_process = "Go through Land Bank"
- elif city_owner_agency == "PRA":
- access_process = "Do Nothing"
- else:
- if market_value_over_1000:
- access_process = "Private Land Use Agreement"
- else:
- access_process = "Buy Property"
-
- access_processes.append(access_process)
-
- dataset.gdf["access_process"] = access_processes
-
- return dataset
diff --git a/data/src/data_utils/city_owned_properties.py b/data/src/data_utils/city_owned_properties.py
deleted file mode 100644
index a5b21980..00000000
--- a/data/src/data_utils/city_owned_properties.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from typing import Any
-from classes.featurelayer import FeatureLayer
-from constants.services import CITY_OWNED_PROPERTIES_TO_LOAD
-
-def city_owned_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
- """
- Processes city-owned property data by joining it with the primary feature layer,
- renaming columns, and updating access information for properties based on ownership.
- All instances where the "city_owner_agency" is "PLB" are changed to "Land Bank (PHDC)".
-
- Args:
- primary_featurelayer (FeatureLayer): The primary feature layer to which city-owned
- property data will be joined.
-
- Returns:
- FeatureLayer: The updated primary feature layer with processed city ownership
- information.
- """
- city_owned_properties = FeatureLayer(
- name="City Owned Properties",
- esri_rest_urls=CITY_OWNED_PROPERTIES_TO_LOAD,
- cols=["OPABRT", "AGENCY", "SIDEYARDELIGIBLE"],
- )
-
- city_owned_properties.gdf.dropna(subset=["opabrt"], inplace=True)
-
- primary_featurelayer.opa_join(city_owned_properties.gdf, "opabrt")
-
- rename_columns = {
- "agency": "city_owner_agency",
- "sideyardeligible": "side_yard_eligible",
- }
- primary_featurelayer.gdf.rename(columns=rename_columns, inplace=True)
-
- primary_featurelayer.gdf.loc[
- primary_featurelayer.gdf["owner_1"].isin(
- [
- "PHILADELPHIA HOUSING AUTH",
- "PHILADELPHIA LAND BANK",
- "REDEVELOPMENT AUTHORITY",
- "PHILA REDEVELOPMENT AUTH",
- ]
- ),
- "city_owner_agency",
- ] = primary_featurelayer.gdf["owner_1"].replace(
- {
- "PHILADELPHIA HOUSING AUTH": "PHA",
- "PHILADELPHIA LAND BANK": "Land Bank (PHDC)",
- "REDEVELOPMENT AUTHORITY": "PRA",
- "PHILA REDEVELOPMENT AUTH": "PRA",
- }
- )
-
- primary_featurelayer.gdf.loc[
- (primary_featurelayer.gdf["owner_1"] == "CITY OF PHILA")
- & (
- primary_featurelayer.gdf["owner_2"].str.contains(
- "PUBLIC PROP|PUBLC PROP", na=False
- )
- ),
- "city_owner_agency",
- ] = "DPP"
-
- primary_featurelayer.gdf.loc[
- primary_featurelayer.gdf["owner_1"].isin(
- ["CITY OF PHILADELPHIA", "CITY OF PHILA"]
- )
- & primary_featurelayer.gdf["owner_2"].isna(),
- "city_owner_agency",
- ] = "City of Philadelphia"
-
- primary_featurelayer.gdf.loc[:, "side_yard_eligible"] = primary_featurelayer.gdf[
- "side_yard_eligible"
- ].fillna("No")
-
- # Update all instances where city_owner_agency is "PLB" to "Land Bank (PHDC)"
- primary_featurelayer.gdf.loc[
- primary_featurelayer.gdf["city_owner_agency"] == "PLB", "city_owner_agency"
- ] = "Land Bank (PHDC)"
-
- return primary_featurelayer
diff --git a/data/src/data_utils/community_gardens.py b/data/src/data_utils/community_gardens.py
deleted file mode 100644
index 4bed0284..00000000
--- a/data/src/data_utils/community_gardens.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from classes.featurelayer import FeatureLayer
-from constants.services import COMMUNITY_GARDENS_TO_LOAD
-
-
-def community_gardens(primary_featurelayer):
- # this script *removes* (rather than adds) known community gardens from the dataset in order to protect them from potential predatory developers
- community_gardens = FeatureLayer(
- name="Community Gardens", esri_rest_urls=COMMUNITY_GARDENS_TO_LOAD
- )
-
- community_gardens.gdf = community_gardens.gdf[["Site_Name", "geometry"]]
-
- primary_featurelayer.spatial_join(community_gardens)
-
- # Create a boolean mask where 'site_Name' is not null
- mask = primary_featurelayer.gdf["Site_Name"].notnull()
-
- count_dropped = mask.sum()
- print(f"Number of community gardens being dropped: {count_dropped}")
-
- # Use this mask to drop rows where 'site_Name' is not null
- primary_featurelayer.gdf = primary_featurelayer.gdf.drop(
- primary_featurelayer.gdf[mask].index
- )
-
- primary_featurelayer.gdf = primary_featurelayer.gdf.drop(columns=["Site_Name"])
-
- return primary_featurelayer
diff --git a/data/src/data_utils/conservatorship.py b/data/src/data_utils/conservatorship.py
deleted file mode 100644
index 5f9c9793..00000000
--- a/data/src/data_utils/conservatorship.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import datetime
-from dateutil.parser import parse
-import pytz
-
-est = pytz.timezone("US/Eastern")
-six_months_ago = (datetime.datetime.now() - datetime.timedelta(days=180)).astimezone(
- est
-)
-
-blight_words = [
- "weed",
- "rubbish",
- "garbage",
- "tire",
- "debris",
- "clean",
- "waste",
- "vegetation",
- "dumping",
- "scrap",
- "auto",
- "vehicle",
- "graffiti",
- "dangerous",
-]
-
-
-def conservatorship(primary_featurelayer):
- conservatorships = []
-
- for idx, row in primary_featurelayer.gdf.iterrows():
- city_owner_agency = row["city_owner_agency"]
- sheriff_sale = row["sheriff_sale"] == "Y"
- market_value_over_1000 = (
- row["market_value"] and float(row["market_value"]) > 1000
- )
- li_complaints_lower = str(row["li_complaints"]).lower().split(" ")
- contains_blight_word = any(word in li_complaints_lower for word in blight_words)
-
- try:
- sale_date = parse(row["sale_date"]).astimezone(est)
- sale_date_6_months_ago = sale_date <= six_months_ago
- except (TypeError, ValueError):
- sale_date_6_months_ago = False
-
- # Simplified decision logic
- if city_owner_agency == "Land Bank (PHDC)" or (
- not sale_date_6_months_ago and market_value_over_1000
- ):
- conservatorship = "No"
- elif contains_blight_word and not sheriff_sale and sale_date_6_months_ago:
- conservatorship = "Yes"
- else:
- conservatorship = "No"
-
- conservatorships.append(conservatorship)
-
- primary_featurelayer.gdf["conservatorship"] = conservatorships
- return primary_featurelayer
diff --git a/data/src/data_utils/contig_neighbors.py b/data/src/data_utils/contig_neighbors.py
deleted file mode 100644
index 1c811a15..00000000
--- a/data/src/data_utils/contig_neighbors.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import warnings
-
-import networkx as nx
-from libpysal.weights import Queen
-
-
-def contig_neighbors(primary_featurelayer):
- parcels = primary_featurelayer.gdf
-
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=FutureWarning)
- warnings.filterwarnings(
- "ignore",
- category=UserWarning,
- message="The weights matrix is not fully connected",
- )
-
- w = Queen.from_dataframe(parcels)
-
- g = w.to_networkx()
-
- # Calculate the number of contiguous neighbors for each feature in parcels
- n_contiguous = [len(nx.node_connected_component(g, i)) for i in range(len(parcels))]
-
- primary_featurelayer.gdf["n_contiguous"] = n_contiguous
-
- return primary_featurelayer
diff --git a/data/src/data_utils/deliquencies.py b/data/src/data_utils/deliquencies.py
deleted file mode 100644
index 16f8f205..00000000
--- a/data/src/data_utils/deliquencies.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from classes.featurelayer import FeatureLayer
-from constants.services import DELINQUENCIES_QUERY
-
-
-def deliquencies(primary_featurelayer):
- tax_deliquencies = FeatureLayer(
- name="Property Tax Delinquencies",
- carto_sql_queries=DELINQUENCIES_QUERY,
- use_wkb_geom_field="the_geom",
- cols=[
- "opa_number",
- "total_due",
- "is_actionable",
- "payment_agreement",
- "num_years_owed",
- "most_recent_year_owed",
- "total_assessment",
- "sheriff_sale",
- ],
- )
-
- primary_featurelayer.opa_join(
- tax_deliquencies.gdf,
- "opa_number",
- )
-
- primary_featurelayer.gdf.loc[:, "sheriff_sale"] = primary_featurelayer.gdf[
- "sheriff_sale"
- ].fillna("N")
-
- return primary_featurelayer
diff --git a/data/src/data_utils/dev_probability.py b/data/src/data_utils/dev_probability.py
deleted file mode 100644
index 4c8a220f..00000000
--- a/data/src/data_utils/dev_probability.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import geopandas as gpd
-import jenkspy
-import pandas as pd
-import requests
-from classes.featurelayer import FeatureLayer
-from constants.services import CENSUS_BGS_URL, PERMITS_QUERY
-
-from config.config import USE_CRS
-
-
-def dev_probability(primary_featurelayer):
- census_bgs_gdf = gpd.read_file(CENSUS_BGS_URL)
- census_bgs_gdf = census_bgs_gdf.to_crs(USE_CRS)
-
- base_url = "https://phl.carto.com/api/v2/sql"
- response = requests.get(f"{base_url}?q={PERMITS_QUERY}&format=GeoJSON")
-
- if response.status_code == 200:
- try:
- permits_gdf = gpd.GeoDataFrame.from_features(
- response.json(), crs="EPSG:4326"
- )
- print("GeoDataFrame created successfully.")
- except Exception as e:
- print(f"Failed to convert response to GeoDataFrame: {e}")
- return primary_featurelayer
- else:
- truncated_response = response.content[:500]
- print(
- f"Failed to fetch permits data. HTTP status code: {response.status_code}. Response text: {truncated_response}"
- )
- return primary_featurelayer
-
- permits_gdf = permits_gdf.to_crs(USE_CRS)
-
- joined_gdf = gpd.sjoin(permits_gdf, census_bgs_gdf, how="inner", predicate="within")
-
- permit_counts = joined_gdf.groupby("index_right").size()
- census_bgs_gdf["permit_count"] = census_bgs_gdf.index.map(permit_counts)
- census_bgs_gdf["permit_count"] = census_bgs_gdf["permit_count"].fillna(0)
-
- # Classify development probability using Jenks natural breaks
- breaks = jenkspy.jenks_breaks(census_bgs_gdf["permit_count"], n_classes=3)
- census_bgs_gdf["dev_rank"] = pd.cut(
- census_bgs_gdf["permit_count"], bins=breaks, labels=["Low", "Medium", "High"]
- ).astype(str)
-
- updated_census_bgs = FeatureLayer(
- name="Updated Census Block Groups",
- gdf=census_bgs_gdf[["permit_count", "dev_rank", "geometry"]],
- use_wkb_geom_field="geometry",
- cols=["permit_count", "dev_rank"],
- )
-
- updated_census_bgs.gdf = updated_census_bgs.gdf.to_crs(USE_CRS)
-
- primary_featurelayer.spatial_join(updated_census_bgs)
-
- return primary_featurelayer
diff --git a/data/src/data_utils/drug_crimes.py b/data/src/data_utils/drug_crimes.py
deleted file mode 100644
index 61a4a43c..00000000
--- a/data/src/data_utils/drug_crimes.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from constants.services import DRUGCRIME_SQL_QUERY
-
-
-from data_utils.kde import apply_kde_to_primary
-
-
-def drug_crimes(primary_featurelayer):
- return apply_kde_to_primary(
- primary_featurelayer, "Drug Crimes", DRUGCRIME_SQL_QUERY
- )
diff --git a/data/src/data_utils/gun_crimes.py b/data/src/data_utils/gun_crimes.py
deleted file mode 100644
index 27155546..00000000
--- a/data/src/data_utils/gun_crimes.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from constants.services import GUNCRIME_SQL_QUERY
-
-
-from data_utils.kde import apply_kde_to_primary
-
-
-def gun_crimes(primary_featurelayer):
- return apply_kde_to_primary(primary_featurelayer, "Gun Crimes", GUNCRIME_SQL_QUERY)
diff --git a/data/src/data_utils/imm_dang_buildings.py b/data/src/data_utils/imm_dang_buildings.py
deleted file mode 100644
index 7e7041ba..00000000
--- a/data/src/data_utils/imm_dang_buildings.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from classes.featurelayer import FeatureLayer
-from constants.services import IMMINENT_DANGER_BUILDINGS_QUERY
-
-
-def imm_dang_buildings(primary_featurelayer):
- imm_dang_buildings = FeatureLayer(
- name="Imminently Dangerous Buildings",
- use_wkb_geom_field="the_geom",
- carto_sql_queries=IMMINENT_DANGER_BUILDINGS_QUERY,
- cols=["opa_account_num"],
- )
-
- imm_dang_buildings.gdf.loc[:, "imm_dang_building"] = "Y"
-
- imm_dang_buildings.gdf = imm_dang_buildings.gdf.rename(
- columns={"opa_account_num": "opa_number"}
- )
-
- primary_featurelayer.opa_join(
- imm_dang_buildings.gdf,
- "opa_number",
- )
-
- primary_featurelayer.gdf.loc[:, "imm_dang_building"] = primary_featurelayer.gdf[
- "imm_dang_building"
- ].fillna("N")
-
- return primary_featurelayer
diff --git a/data/src/data_utils/kde.py b/data/src/data_utils/kde.py
deleted file mode 100644
index 477cc306..00000000
--- a/data/src/data_utils/kde.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import numpy as np
-import rasterio
-from awkde.awkde import GaussianKDE
-from classes.featurelayer import FeatureLayer
-from config.config import USE_CRS
-from rasterio.transform import Affine
-from tqdm import tqdm
-from concurrent.futures import ProcessPoolExecutor, as_completed
-
-import mapclassify
-
-resolution = 1320 # 0.25 miles (in feet, bc the CRS is 2272)
-batch_size = 100000
-
-
-def kde_predict_chunk(kde, chunk):
- """Helper function to predict KDE for a chunk of grid points."""
- return kde.predict(chunk)
-
-
-def generic_kde(name, query, resolution=resolution, batch_size=batch_size):
- print(f"Initializing FeatureLayer for {name}")
-
- feature_layer = FeatureLayer(name=name, carto_sql_queries=query)
-
- coords = np.array([geom.xy for geom in feature_layer.gdf.geometry])
- x, y = coords[:, 0, :].flatten(), coords[:, 1, :].flatten()
-
- X = np.column_stack((x, y))
-
- x_grid, y_grid = (
- np.linspace(x.min(), x.max(), resolution),
- np.linspace(y.min(), y.max(), resolution),
- )
- xx, yy = np.meshgrid(x_grid, y_grid)
- grid_points = np.column_stack((xx.ravel(), yy.ravel()))
-
- print(f"Fitting KDE for {name} data")
- kde = GaussianKDE(glob_bw=0.1, alpha=0.999, diag_cov=True)
- kde.fit(X)
-
- print(f"Predicting KDE values for grid of size {grid_points.shape}")
-
- # Split grid points into chunks
- chunks = [
- grid_points[i : i + batch_size] for i in range(0, len(grid_points), batch_size)
- ]
-
- # Run predictions in parallel
- z = np.zeros(len(grid_points)) # Placeholder for predicted values
-
- with ProcessPoolExecutor() as executor:
- # Submit the tasks first, wrapped with tqdm to monitor as they're submitted
- futures = {
- executor.submit(kde_predict_chunk, kde, chunk): i
- for i, chunk in enumerate(tqdm(chunks, desc="Submitting tasks"))
- }
-
- # Now wrap the as_completed with tqdm for progress tracking
- for future in tqdm(
- as_completed(futures), total=len(futures), desc="Processing tasks"
- ):
- i = futures[future]
- z[i * batch_size : (i + 1) * batch_size] = future.result()
-
- zz = z.reshape(xx.shape)
-
- x_res, y_res = (
- (x.max() - x.min()) / (resolution - 1),
- (y.max() - y.min()) / (resolution - 1),
- )
- min_x, min_y = x.min(), y.min()
-
- transform = Affine.translation(min_x, min_y) * Affine.scale(x_res, y_res)
-
- raster_filename = f"tmp/{name.lower().replace(' ', '_')}.tif"
- print(f"Saving raster to {raster_filename}")
-
- with rasterio.open(
- raster_filename,
- "w",
- driver="GTiff",
- height=zz.shape[0],
- width=zz.shape[1],
- count=1,
- dtype=zz.dtype,
- crs=USE_CRS,
- transform=transform,
- ) as dst:
- dst.write(zz, 1)
-
- return raster_filename, X
-
-
-def apply_kde_to_primary(primary_featurelayer, name, query, resolution=resolution):
- # Generate KDE and raster file
- raster_filename, crime_coords = generic_kde(name, query, resolution)
-
- # Add centroid column temporarily
- primary_featurelayer.gdf["centroid"] = primary_featurelayer.gdf.geometry.centroid
-
- # Create list of (x, y) coordinates for centroids
- coord_list = [
- (x, y)
- for x, y in zip(
- primary_featurelayer.gdf["centroid"].x,
- primary_featurelayer.gdf["centroid"].y,
- )
- ]
-
- # Remove the temporary centroid column
- primary_featurelayer.gdf = primary_featurelayer.gdf.drop(columns=["centroid"])
-
- # Open the generated raster file and sample the KDE density values at the centroids
- with rasterio.open(raster_filename) as src:
- sampled_values = [x[0] for x in src.sample(coord_list)]
-
- # Create a column for the density values
- density_column = f"{name.lower().replace(' ', '_')}_density"
- primary_featurelayer.gdf[density_column] = sampled_values
-
- # Calculate percentiles using mapclassify.Percentiles
- percentile_breaks = list(range(101)) # Percentile breaks from 0 to 100
- classifier = mapclassify.Percentiles(
- primary_featurelayer.gdf[density_column], pct=percentile_breaks
- )
-
- # Assign the percentile bins to the density values
- primary_featurelayer.gdf[density_column + "_percentile"] = (
- classifier.yb
- ) # yb gives the bin index
-
- # Apply percentile labels (e.g., 1st Percentile, 2nd Percentile, etc.)
- primary_featurelayer.gdf[density_column + "_label"] = primary_featurelayer.gdf[
- density_column + "_percentile"
- ].apply(label_percentile)
-
- # Convert the percentile column to float and drop the density column
- primary_featurelayer.gdf[density_column + "_percentile"] = primary_featurelayer.gdf[
- density_column + "_percentile"
- ].astype(float)
-
- primary_featurelayer.gdf = primary_featurelayer.gdf.drop(columns=[density_column])
-
- print(f"Finished processing {name}")
- return primary_featurelayer
-
-
-def label_percentile(value):
- if 10 <= value % 100 <= 13:
- return f"{value}th Percentile"
- elif value % 10 == 1:
- return f"{value}st Percentile"
- elif value % 10 == 2:
- return f"{value}nd Percentile"
- elif value % 10 == 3:
- return f"{value}rd Percentile"
- else:
- return f"{value}th Percentile"
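-
-# Illustrative checks of the suffix logic above (hypothetical values):
-# label_percentile(1) -> "1st Percentile", label_percentile(11) -> "11th Percentile",
-# label_percentile(22) -> "22nd Percentile".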
diff --git a/data/src/data_utils/l_and_i.py b/data/src/data_utils/l_and_i.py
deleted file mode 100644
index 27f28147..00000000
--- a/data/src/data_utils/l_and_i.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import pandas as pd
-import geopandas as gpd
-from typing import List
-from classes.featurelayer import FeatureLayer
-from constants.services import COMPLAINTS_SQL_QUERY, VIOLATIONS_SQL_QUERY
-
-def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
- """
- Process L&I (Licenses and Inspections) data for complaints and violations.
-
- This function filters and processes L&I complaints and violations data,
- joining it with the primary feature layer based on spatial relationships
- and OPA (Office of Property Assessment) identifiers.
-
- Args:
- primary_featurelayer (FeatureLayer): The primary feature layer to join L&I data to.
-
- Returns:
- FeatureLayer: The primary feature layer updated with L&I data.
- """
- keywords: List[str] = [
- 'dumping', 'blight', 'rubbish', 'weeds', 'graffiti',
- 'abandoned', 'sanitation', 'litter', 'vacant', 'trash',
- 'unsafe'
- ]
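-    # Note: the filters below join these keywords with '|' into a regex alternation,
-    # so any keyword appearing as a substring of the lowercased text matches.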
-
- # Load complaints data from L&I
- l_and_i_complaints: FeatureLayer = FeatureLayer(
- name="LI Complaints",
- carto_sql_queries=COMPLAINTS_SQL_QUERY
- )
-
- # Filter for rows where 'subject' contains any of the keywords
- l_and_i_complaints.gdf = l_and_i_complaints.gdf[
- l_and_i_complaints.gdf["subject"].str.lower().str.contains('|'.join(keywords))
- ]
-
- # Filter for only Status = 'Open'
- l_and_i_complaints.gdf = l_and_i_complaints.gdf[
- l_and_i_complaints.gdf["status"].str.lower() == "open"
- ]
-
-    # Group by geometry and concatenate the service_name values into a semicolon-separated string
- l_and_i_complaints.gdf = (
- l_and_i_complaints.gdf.groupby("geometry")["service_name"]
- .apply(lambda x: "; ".join([val for val in x if val is not None]))
- .reset_index()
- )
-
- l_and_i_complaints.rebuild_gdf()
-
- # rename the column to 'li_complaints'
- l_and_i_complaints.gdf.rename(
- columns={"service_name": "li_complaints"}, inplace=True
- )
-
- # Load data for violations from L&I
- l_and_i_violations: FeatureLayer = FeatureLayer(
- name="LI Violations",
- carto_sql_queries=VIOLATIONS_SQL_QUERY,
- from_xy=True
- )
-
-    # Filter for rows where 'violationcodetitle' contains any of the keywords, handling NaN values
- l_and_i_violations.gdf = l_and_i_violations.gdf[
- l_and_i_violations.gdf["violationcodetitle"].fillna('').str.lower().str.contains('|'.join(keywords))
- ]
-
- all_violations_count_df: pd.DataFrame = (
- l_and_i_violations.gdf.groupby("opa_account_num")
- .count()
- .reset_index()[["opa_account_num", "violationnumber", "geometry"]]
- )
- all_violations_count_df = all_violations_count_df.rename(
- columns={"violationnumber": "all_violations_past_year"}
- )
-    # Filter for only violations whose violationstatus is 'open'
- violations_gdf: gpd.GeoDataFrame = l_and_i_violations.gdf[
- (l_and_i_violations.gdf["violationstatus"].str.lower() == "open")
- ]
-
- open_violations_count_df: pd.DataFrame = (
- violations_gdf.groupby("opa_account_num")
- .count()
- .reset_index()[["opa_account_num", "violationnumber", "geometry"]]
- )
- open_violations_count_df = open_violations_count_df.rename(
- columns={"violationnumber": "open_violations_past_year"}
- )
- # join the all_violations_count_df and open_violations_count_df dataframes on opa_account_num
- violations_count_gdf: gpd.GeoDataFrame = all_violations_count_df.merge(
- open_violations_count_df, how="left", on="opa_account_num"
- )
-
- # replace NaN values with 0
- violations_count_gdf.fillna(0, inplace=True)
-
- # convert the all_violations_past_year and open_violations_past_year columns to integers
- violations_count_gdf["all_violations_past_year"] = violations_count_gdf[
- "all_violations_past_year"
- ].astype(int)
- violations_count_gdf["open_violations_past_year"] = violations_count_gdf[
- "open_violations_past_year"
- ].astype(int)
- violations_count_gdf = violations_count_gdf[
- ["opa_account_num", "all_violations_past_year", "open_violations_past_year"]
- ]
-
-    # Collapse violations by geometry and concatenate the violationcodetitle values into a semicolon-separated string
- l_and_i_violations.gdf = (
- l_and_i_violations.gdf.groupby("geometry")["violationcodetitle"]
- .apply(lambda x: "; ".join([val for val in x if val is not None]))
- .reset_index()
- )
-    l_and_i_violations.rebuild_gdf()
-
-    # rename the column to 'li_code_violations'
- l_and_i_violations.gdf.rename(
- columns={"violationcodetitle": "li_code_violations"}, inplace=True
- )
-
- # Violations can work with an OPA join
- primary_featurelayer.opa_join(
- violations_count_gdf,
- "opa_account_num",
- )
-
- # Complaints need a spatial join, but we need to take special care to merge on just the parcel geoms first to get opa_id
- complaints_with_opa_id: gpd.GeoDataFrame = primary_featurelayer.gdf.sjoin(
- l_and_i_complaints.gdf, how="left", predicate="contains"
- )
- complaints_with_opa_id.drop(columns=["index_right"], inplace=True)
-
- # Concatenate the complaints values into a list with a semicolon separator by opa_id
- complaints_with_opa_id = (
- complaints_with_opa_id.groupby("opa_id")["li_complaints"]
- .apply(lambda x: "; ".join([str(val) for val in x if val is not None]))
- .reset_index()[["opa_id", "li_complaints"]]
- )
-
- # Clean up the NaN values in the li_complaints column
- def remove_nan_strings(x: str) -> str | None:
-        """
-        Null out concatenated complaint strings that are 'nan' or contain a 'nan' entry.
-
-        Args:
-            x (str): Input string.
-
-        Returns:
-            str | None: The original string, or None if it is 'nan' or contains 'nan;'.
-        """
- if x == "nan" or ("nan;" in x):
- return None
- else:
- return x
-
- complaints_with_opa_id["li_complaints"] = complaints_with_opa_id[
- "li_complaints"
- ].apply(remove_nan_strings)
-
- # Merge the complaints values back into the primary_featurelayer
- primary_featurelayer.opa_join(
- complaints_with_opa_id,
- "opa_id",
- )
-
- primary_featurelayer.gdf[
- ["all_violations_past_year", "open_violations_past_year"]
- ] = (
- primary_featurelayer.gdf[
- ["all_violations_past_year", "open_violations_past_year"]
- ]
- .apply(lambda x: pd.to_numeric(x, errors="coerce"))
- .fillna(0)
- .astype(int)
- )
-
- return primary_featurelayer
\ No newline at end of file
diff --git a/data/src/data_utils/nbhoods.py b/data/src/data_utils/nbhoods.py
deleted file mode 100644
index 6fde4bd0..00000000
--- a/data/src/data_utils/nbhoods.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import geopandas as gpd
-from classes.featurelayer import FeatureLayer
-from constants.services import NBHOODS_URL
-
-from config.config import USE_CRS
-
-
-def nbhoods(primary_featurelayer):
- phl_nbhoods = gpd.read_file(NBHOODS_URL)
-
- # Correct the column name to uppercase if needed
-    # Rename the MAPNAME column to 'neighborhood' if present
- phl_nbhoods.rename(columns={"MAPNAME": "neighborhood"}, inplace=True)
-
- phl_nbhoods = phl_nbhoods.to_crs(USE_CRS)
-
- nbhoods = FeatureLayer("Neighborhoods")
- nbhoods.gdf = phl_nbhoods
-
- red_cols_to_keep = ["neighborhood", "geometry"]
- nbhoods.gdf = nbhoods.gdf[red_cols_to_keep]
-
- primary_featurelayer.spatial_join(nbhoods)
-
- return primary_featurelayer
diff --git a/data/src/data_utils/negligent_devs.py b/data/src/data_utils/negligent_devs.py
deleted file mode 100644
index aa95532c..00000000
--- a/data/src/data_utils/negligent_devs.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import re
-
-import pandas as pd
-
-replacements = {
- "STREET": "ST",
- "AVENUE": "AVE",
- "ROAD": "RD",
- "BOULEVARD": "BLVD",
- "PLACE": "PL",
- "FLOOR": "FL",
- "FLR": "FL",
- "FIRST": "1ST",
- "SECOND": "2ND",
- "THIRD": "3RD",
- "FOURTH": "4TH",
- "FIFTH": "5TH",
- "SIXTH": "6TH",
- "SEVENTH": "7TH",
- "EIGHTH": "8TH",
- "NINTH": "9TH",
- "NORTH": "N",
- "SOUTH": "S",
- "EAST": "E",
- "WEST": "W",
- "SUITE": "STE",
- "LA": "LN",
- "LANE": "LN",
- "PARKWAY": "PKY",
-}
-
-
-def standardize_street(street):
- if not isinstance(street, str):
- return ""
- for full, abbr in replacements.items():
- street = re.sub(r"\b{}\b".format(full), abbr, street, flags=re.IGNORECASE)
- return street
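-
-# Illustrative example (hypothetical input): standardize_street("1234 North Broad Street")
-# returns "1234 N Broad ST"; only whole-word matches from `replacements` are abbreviated.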
-
-
-def create_standardized_address(row):
- parts = [
- row["mailing_address_1"].strip()
- if pd.notnull(row["mailing_address_1"])
- else "",
- row["mailing_address_2"].strip()
- if pd.notnull(row["mailing_address_2"])
- else "",
- row["mailing_street"].strip() if pd.notnull(row["mailing_street"]) else "",
- row["mailing_city_state"].strip()
- if pd.notnull(row["mailing_city_state"])
- else "",
- row["mailing_zip"].strip() if pd.notnull(row["mailing_zip"]) else "",
- ]
- standardized_address = ", ".join([part for part in parts if part])
- return standardized_address.lower()
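-
-# Illustrative example (hypothetical row): mailing_street "1234 N Broad ST" and
-# mailing_city_state "PHILADELPHIA PA" (other parts empty) combine to
-# "1234 n broad st, philadelphia pa".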
-
-
-def negligent_devs(primary_featurelayer):
- devs = primary_featurelayer.gdf
-
- print("Columns in 'devs' DataFrame:", devs.columns)
-
- print("Initial properties data:")
- print(devs[['opa_id', 'city_owner_agency', 'mailing_street']].head(10))
-
- city_owners = devs.loc[~devs["city_owner_agency"].isna() & (devs["city_owner_agency"] != "")].copy()
- non_city_owners = devs.loc[devs["city_owner_agency"].isna() | (devs["city_owner_agency"] == "")].copy()
-
- print(f"City owners shape: {city_owners.shape}, Non-city owners shape: {non_city_owners.shape}")
-
- # Log before standardizing addresses
- print("Non-city owners mailing streets before standardization:")
- print(non_city_owners[['opa_id', 'mailing_street']].head(10))
-
- non_city_owners.loc[:, "mailing_street"] = (
- non_city_owners["mailing_street"].astype(str).apply(standardize_street)
- )
-
- print("Non-city owners mailing streets after standardization:")
- print(non_city_owners[['opa_id', 'mailing_street']].head(10))
-
- for term in ["ST", "AVE", "RD", "BLVD"]:
- non_city_owners.loc[:, "mailing_street"] = non_city_owners[
- "mailing_street"
- ].replace(regex={f"{term}.*": term})
-
- # Log after applying term replacement
- print("Non-city owners mailing streets after term replacement:")
- print(non_city_owners[['opa_id', 'mailing_street']].head(10))
-
- # Fill missing address components
- non_city_owners.loc[:, "mailing_address_1"] = non_city_owners[
- "mailing_address_1"
- ].fillna("")
- non_city_owners.loc[:, "mailing_address_2"] = non_city_owners[
- "mailing_address_2"
- ].fillna("")
- non_city_owners.loc[:, "mailing_street"] = non_city_owners["mailing_street"].fillna(
- ""
- )
- non_city_owners.loc[:, "mailing_city_state"] = non_city_owners[
- "mailing_city_state"
- ].fillna("")
- non_city_owners.loc[:, "mailing_zip"] = non_city_owners["mailing_zip"].fillna("")
-
- # Log addresses before creating standardized address
- print("Non-city owners mailing details before creating standardized address:")
- print(non_city_owners[['opa_id', 'mailing_street', 'mailing_city_state', 'mailing_zip']].head(10))
-
- non_city_owners.loc[:, "standardized_address"] = non_city_owners.apply(
- create_standardized_address, axis=1
- )
-
- # Log standardized addresses and counts
- print("Standardized addresses with counts:")
- address_counts = (
- non_city_owners.groupby("standardized_address")
- .size()
- .reset_index(name="property_count")
- )
- print(address_counts.head(10))
-
- sorted_address_counts = address_counts.sort_values(
- by="property_count", ascending=False
- )
- print("Top standardized addresses by property count:")
- print(sorted_address_counts.head(10))
-
- non_city_owners = non_city_owners.merge(
- sorted_address_counts, on="standardized_address", how="left"
- )
-
- # Log merged data for city owners
- city_owner_counts = (
- city_owners.groupby("city_owner_agency")
- .size()
- .reset_index(name="property_count")
- )
- print("City owner counts:")
- print(city_owner_counts.head(10))
-
- city_owners = city_owners.merge(
- city_owner_counts, on="city_owner_agency", how="left"
- )
-
- devs_combined = pd.concat([city_owners, non_city_owners], axis=0)
-
- # Final check on the merged data before updating primary_featurelayer
- print("Combined data with property counts:")
- print(devs_combined[['opa_id', 'property_count']].head(10))
-
- primary_featurelayer.gdf = primary_featurelayer.gdf.merge(
- devs_combined[["opa_id", "property_count"]], on="opa_id", how="left"
- )
- primary_featurelayer.gdf.rename(
- columns={"property_count": "n_properties_owned"}, inplace=True
- )
- primary_featurelayer.gdf.loc[:, "negligent_dev"] = (
- primary_featurelayer.gdf["n_properties_owned"] > 5
- ) & (primary_featurelayer.gdf["city_owner_agency"].isna() | (primary_featurelayer.gdf["city_owner_agency"] == ""))
-
- print("Final feature layer data with negligent_dev flag:")
- print(primary_featurelayer.gdf[['opa_id', 'n_properties_owned', 'negligent_dev']].head(10))
-
- return primary_featurelayer
diff --git a/data/src/data_utils/opa_properties.py b/data/src/data_utils/opa_properties.py
deleted file mode 100644
index 2d02f42f..00000000
--- a/data/src/data_utils/opa_properties.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from classes.featurelayer import FeatureLayer
-from constants.services import OPA_PROPERTIES_QUERY
-
-
-def opa_properties(primary_featurelayer):
- opa = FeatureLayer(
- name="OPA Properties",
- carto_sql_queries=OPA_PROPERTIES_QUERY,
- use_wkb_geom_field="the_geom",
- cols=[
- "market_value",
- "sale_date",
- "sale_price",
- "parcel_number",
- "mailing_address_1",
- "mailing_address_2",
- "mailing_care_of",
- "mailing_city_state",
- "mailing_street",
- "mailing_zip"
- ]
- )
-
- primary_featurelayer.opa_join(
- opa.gdf,
- "parcel_number",
- )
-
- return primary_featurelayer
diff --git a/data/src/data_utils/owner_type.py b/data/src/data_utils/owner_type.py
deleted file mode 100644
index 291364df..00000000
--- a/data/src/data_utils/owner_type.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import pandas as pd
-from classes.featurelayer import FeatureLayer
-
-def owner_type(primary_featurelayer: FeatureLayer) -> FeatureLayer:
- """
- Determines the ownership type for each property in the primary feature layer based on
- the 'owner_1', 'owner_2', and 'city_owner_agency' columns. The ownership type is set as:
- - "Public" if 'city_owner_agency' is not NA.
- - "Business (LLC)" if 'city_owner_agency' is NA and "LLC" is found in 'owner_1' or 'owner_2'.
- - "Individual" if 'city_owner_agency' is NA and "LLC" is not found in 'owner_1' or 'owner_2'.
-
- Args:
- primary_featurelayer (FeatureLayer): The feature layer containing property ownership data.
-
- Returns:
- FeatureLayer: The updated feature layer with the 'owner_type' column added.
- """
- owner_types = []
-
- for _, row in primary_featurelayer.gdf.iterrows():
- # Extract owner1, owner2, and city_owner_agency
- owner1 = str(row["owner_1"]).lower()
- owner2 = str(row["owner_2"]).lower()
- city_owner_agency = row["city_owner_agency"]
-
- # Determine ownership type based on the conditions
- if pd.notna(city_owner_agency):
- owner_types.append("Public")
- elif " llc" in owner1 or " llc" in owner2:
- owner_types.append("Business (LLC)")
- else:
- owner_types.append("Individual")
-
- # Add the 'owner_type' column to the GeoDataFrame
- primary_featurelayer.gdf["owner_type"] = owner_types
-
- return primary_featurelayer
diff --git a/data/src/data_utils/park_priority.py b/data/src/data_utils/park_priority.py
deleted file mode 100644
index 7a97fb3b..00000000
--- a/data/src/data_utils/park_priority.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import os
-import zipfile
-from io import BytesIO
-from typing import List, Union
-
-import geopandas as gpd
-import requests
-from bs4 import BeautifulSoup
-from classes.featurelayer import FeatureLayer
-from config.config import USE_CRS
-from tqdm import tqdm
-import pyogrio
-
-
-def get_latest_shapefile_url() -> str:
- """
- Scrapes the TPL website to get the URL of the latest shapefile.
-
- Returns:
- str: The URL of the latest shapefile.
-
- Raises:
- ValueError: If the shapefile link is not found on the page.
- """
- url: str = "https://www.tpl.org/park-data-downloads"
- response: requests.Response = requests.get(url)
- soup: BeautifulSoup = BeautifulSoup(response.content, "html.parser")
-
- shapefile_link: Union[BeautifulSoup, None] = soup.find("a", string="Shapefile")
- if shapefile_link:
- return str(shapefile_link["href"])
- else:
- raise ValueError("Shapefile link not found on the page")
-
-
-def download_and_process_shapefile(
- geojson_path: str, park_url: str, target_files: List[str], file_name_prefix: str
-) -> gpd.GeoDataFrame:
- """
- Downloads and processes the shapefile to create a GeoDataFrame for Philadelphia parks.
-
- Args:
- geojson_path (str): Path to save the GeoJSON file.
- park_url (str): URL to download the shapefile.
- target_files (List[str]): List of files to extract from the shapefile.
- file_name_prefix (str): Prefix for the file names to be extracted.
-
- Returns:
- gpd.GeoDataFrame: GeoDataFrame containing the processed park data.
- """
- print("Downloading and processing park priority data...")
- response: requests.Response = requests.get(park_url, stream=True)
- total_size: int = int(response.headers.get("content-length", 0))
-
- with tqdm(
- total=total_size, unit="iB", unit_scale=True, desc="Downloading"
- ) as progress_bar:
- buffer: BytesIO = BytesIO()
- for data in response.iter_content(1024):
- size: int = buffer.write(data)
- progress_bar.update(size)
-
- with zipfile.ZipFile(buffer) as zip_ref:
- for file_name in tqdm(target_files, desc="Extracting"):
- zip_ref.extract(file_name, "tmp/")
-
- print("Processing shapefile...")
- pa_parks: gpd.GeoDataFrame = gpd.read_file(
- "tmp/" + file_name_prefix + "_ParkPriorityAreas.shp"
- )
- pa_parks = pa_parks.to_crs(USE_CRS)
-
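-    # Keep only Philadelphia County areas; 42101 is the county FIPS code (state 42, county 101).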
- phl_parks: gpd.GeoDataFrame = pa_parks[pa_parks["ID"].str.startswith("42101")]
- phl_parks = phl_parks.loc[:, ["ParkNeed", "geometry"]]
-
- if isinstance(phl_parks, gpd.GeoDataFrame):
- phl_parks.rename(columns={"ParkNeed": "park_priority"}, inplace=True)
- else:
- raise TypeError("Expected a GeoDataFrame, got Series or another type instead")
-
- print(f"Writing filtered data to GeoJSON: {geojson_path}")
- phl_parks.to_file(geojson_path, driver="GeoJSON")
-
- return phl_parks
-
-
-def park_priority(primary_featurelayer: FeatureLayer) -> FeatureLayer:
- """
- Downloads and processes park priority data, then joins it with the primary feature layer.
-
- Args:
- primary_featurelayer (FeatureLayer): The primary feature layer to join with park priority data.
-
- Returns:
- FeatureLayer: The primary feature layer with park priority data joined.
- """
- park_url: str = get_latest_shapefile_url()
- print(f"Downloading park priority data from: {park_url}")
-
- file_name_prefix: str = "Parkserve"
- target_files: List[str] = [
- file_name_prefix + "_ParkPriorityAreas.shp",
- file_name_prefix + "_ParkPriorityAreas.dbf",
- file_name_prefix + "_ParkPriorityAreas.shx",
- file_name_prefix + "_ParkPriorityAreas.prj",
- file_name_prefix + "_ParkPriorityAreas.CPG",
- file_name_prefix + "_ParkPriorityAreas.sbn",
- file_name_prefix + "_ParkPriorityAreas.sbx",
- ]
- geojson_path: str = "tmp/phl_parks.geojson"
-
- os.makedirs("tmp/", exist_ok=True)
-
- try:
- if os.path.exists(geojson_path):
- print(f"GeoJSON file already exists, loading from {geojson_path}")
- phl_parks: gpd.GeoDataFrame = gpd.read_file(geojson_path)
- else:
- raise pyogrio.errors.DataSourceError(
- "GeoJSON file missing, forcing download."
- )
-
- except (pyogrio.errors.DataSourceError, ValueError) as e:
- print(f"Error loading GeoJSON: {e}. Re-downloading and processing shapefile.")
- if os.path.exists(geojson_path):
- os.remove(geojson_path) # Delete the corrupted GeoJSON if it exists
- phl_parks = download_and_process_shapefile(
- geojson_path, park_url, target_files, file_name_prefix
- )
-
- park_priority_layer: FeatureLayer = FeatureLayer("Park Priority")
- park_priority_layer.gdf = phl_parks
-
- primary_featurelayer.spatial_join(park_priority_layer)
- return primary_featurelayer
diff --git a/data/src/data_utils/phs_properties.py b/data/src/data_utils/phs_properties.py
deleted file mode 100644
index c906c2d1..00000000
--- a/data/src/data_utils/phs_properties.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from classes.featurelayer import FeatureLayer
-from constants.services import PHS_LAYERS_TO_LOAD
-
-def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
- """
- Perform a spatial join between the primary feature layer and the PHS properties layer,
- then update the primary feature layer with a new column 'phs_care_program' indicating
- if the property is part of the PHS care program.
-
- Args:
- primary_featurelayer (FeatureLayer): The primary feature layer to join with the PHS properties layer.
-
- Returns:
- FeatureLayer: The updated primary feature layer with the 'phs_care_program' column.
- """
-
- phs_properties = FeatureLayer(
- name="PHS Properties", esri_rest_urls=PHS_LAYERS_TO_LOAD, cols=["program"]
- )
-
- # Perform spatial join between primary feature layer and PHS properties
- primary_featurelayer.spatial_join(phs_properties)
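-    # Rows with no PHS match come out of the join with NaN in "program"; the
-    # notna() check below keys on that.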
-
-    # Initialize 'phs_care_program' column with default "No" for all rows
- primary_featurelayer.gdf["phs_care_program"] = "No"
-
-    # Set 'phs_care_program' to "Yes" for matched rows
- primary_featurelayer.gdf.loc[primary_featurelayer.gdf["program"].notna(), "phs_care_program"] = "Yes"
-
- # Rebuild the GeoDataFrame after updates
- primary_featurelayer.rebuild_gdf()
-
- return primary_featurelayer
diff --git a/data/src/data_utils/ppr_properties.py b/data/src/data_utils/ppr_properties.py
deleted file mode 100644
index 48111b35..00000000
--- a/data/src/data_utils/ppr_properties.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import io
-
-import geopandas as gpd
-import requests
-from classes.featurelayer import FeatureLayer
-from constants.services import PPR_PROPERTIES_TO_LOAD
-
-from config.config import USE_CRS
-
-
-def ppr_properties(primary_featurelayer):
- fallback_url = 'https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson'
-
- try:
-
- ppr_properties = FeatureLayer(
- name="PPR Properties",
- esri_rest_urls=PPR_PROPERTIES_TO_LOAD,
- cols=["PUBLIC_NAME"]
- )
-
- if ppr_properties.gdf is None or ppr_properties.gdf.empty:
- raise ValueError("PPR properties GeoDataFrame is empty or failed to load from Esri REST URL.")
-
- print("Loaded PPR properties from Esri REST URL.")
-
- except Exception as e:
- print(f"Error loading PPR properties from Esri REST URL: {e}")
- print("Falling back to loading from GeoJSON URL.")
-
- response = requests.get(fallback_url)
- response.raise_for_status()
- ppr_properties_gdf = gpd.read_file(io.BytesIO(response.content))
-
- ppr_properties = FeatureLayer(name="PPR Properties")
- ppr_properties.gdf = ppr_properties_gdf
-
- ppr_properties.gdf = ppr_properties.gdf[["public_name", "geometry"]]
-
- ppr_properties.gdf = ppr_properties.gdf.to_crs(USE_CRS)
-
- primary_featurelayer.spatial_join(ppr_properties)
-
- mask = primary_featurelayer.gdf["public_name"].notnull()
-
- count_dropped = mask.sum()
- print(f"Number of PPR properties being dropped: {count_dropped}")
-
- primary_featurelayer.gdf = primary_featurelayer.gdf.drop(primary_featurelayer.gdf[mask].index)
-
- primary_featurelayer.gdf = primary_featurelayer.gdf.drop(columns=["public_name"])
-
- return primary_featurelayer
\ No newline at end of file
diff --git a/data/src/data_utils/priority_level.py b/data/src/data_utils/priority_level.py
deleted file mode 100644
index 33097de3..00000000
--- a/data/src/data_utils/priority_level.py
+++ /dev/null
@@ -1,51 +0,0 @@
-def priority_level(dataset):
- priority_levels = []
- for idx, row in dataset.gdf.iterrows():
- priority_level = ""
-
- # Decision Points
- guncrime_density_percentile = row["gun_crimes_density_percentile"]
-        in_phs_landcare = str(row["phs_care_program"]).lower() == "yes"  # phs_properties writes "Yes"/"No"
- has_li_complaint_or_violation = (
- row["li_complaints"] is not None
- and float(row["all_violations_past_year"]) > 0
- )
- very_low_tree_canopy = row["tree_canopy_gap"] >= 0.3
-
- # Updated logic based on percentile values
- if guncrime_density_percentile <= 50:
- # Low Gun Crime Density (Bottom 50%)
- priority_level = "Low"
-
- elif guncrime_density_percentile > 75:
- # High Gun Crime Density (Top 25%)
-
- if has_li_complaint_or_violation:
- priority_level = "High"
- else:
- if in_phs_landcare:
- if very_low_tree_canopy:
- priority_level = "High"
- else:
- priority_level = "Medium"
- else:
- priority_level = "High"
-
- else:
- # Medium Gun Crime Density (Between 50% and 75%)
- if has_li_complaint_or_violation:
- if in_phs_landcare:
- priority_level = "Medium"
- else:
- if very_low_tree_canopy:
- priority_level = "High"
- else:
- priority_level = "Medium"
- else:
- priority_level = "Low"
-
- priority_levels.append(priority_level)
-
- dataset.gdf["priority_level"] = priority_levels
-
- return dataset
diff --git a/data/src/data_utils/rco_geoms.py b/data/src/data_utils/rco_geoms.py
deleted file mode 100644
index 6aa3dca6..00000000
--- a/data/src/data_utils/rco_geoms.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from classes.featurelayer import FeatureLayer
-from constants.services import RCOS_LAYERS_TO_LOAD
-import pandas as pd
-
-pd.set_option("future.no_silent_downcasting", True)
-
-
-def rco_geoms(primary_featurelayer):
- rco_geoms = FeatureLayer(name="RCOs", esri_rest_urls=RCOS_LAYERS_TO_LOAD)
-
- rco_aggregate_cols = [
- "ORGANIZATION_NAME",
- "ORGANIZATION_ADDRESS",
- "PRIMARY_EMAIL",
- "PRIMARY_PHONE",
- ]
-
- rco_use_cols = ["rco_info", "rco_names", "geometry"]
-
- rco_geoms.gdf.loc[:, "rco_info"] = rco_geoms.gdf[rco_aggregate_cols].apply(
- lambda x: "; ".join(map(str, x)), axis=1
- )
-
- rco_geoms.gdf.loc[:, "rco_names"] = rco_geoms.gdf["ORGANIZATION_NAME"]
-
- rco_geoms.gdf = rco_geoms.gdf.loc[:, rco_use_cols].copy()
- rco_geoms.rebuild_gdf()
-
- primary_featurelayer.spatial_join(rco_geoms)
-
- # Collapse columns and aggregate rco_info
- group_columns = [
- col for col in primary_featurelayer.gdf.columns if col not in rco_use_cols
- ]
-
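-    # Fill NaNs before grouping: pandas groupby drops rows whose keys contain NaN
-    # by default, which would silently lose properties here.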
- for col in group_columns:
-        # Use .infer_objects() after fillna() to avoid the silent-downcasting FutureWarning
- primary_featurelayer.gdf.loc[:, col] = (
- primary_featurelayer.gdf[col].fillna("").infer_objects(copy=False)
- )
-
- primary_featurelayer.gdf = (
- primary_featurelayer.gdf.groupby(group_columns)
- .agg(
- {
- "rco_info": lambda x: "|".join(map(str, x)),
- "rco_names": lambda x: "|".join(map(str, x)),
- "geometry": "first",
- }
- )
- .reset_index()
- )
-
- primary_featurelayer.gdf.drop_duplicates(inplace=True)
- primary_featurelayer.rebuild_gdf()
-
- return primary_featurelayer
diff --git a/data/src/data_utils/tactical_urbanism.py b/data/src/data_utils/tactical_urbanism.py
deleted file mode 100644
index df15a0f2..00000000
--- a/data/src/data_utils/tactical_urbanism.py
+++ /dev/null
@@ -1,25 +0,0 @@
-def tactical_urbanism(dataset):
- unsafe_words = [
- "dangerous",
- ]
-
- tactical_urbanism_values = []
-
- for idx, row in dataset.gdf.iterrows():
- li_complaints_lower = str(row["li_complaints"]).lower().split(" ")
- contains_unsafe_word = any(word in li_complaints_lower for word in unsafe_words)
-
- if (
- row["parcel_type"] == "Land"
- and row["unsafe_building"] == "N"
- and row["imm_dang_building"] == "N"
- and not contains_unsafe_word
- ):
- tactical_urbanism = "Yes"
- else:
- tactical_urbanism = "No"
-
- tactical_urbanism_values.append(tactical_urbanism)
-
- dataset.gdf["tactical_urbanism"] = tactical_urbanism_values
- return dataset
diff --git a/data/src/data_utils/tree_canopy.py b/data/src/data_utils/tree_canopy.py
deleted file mode 100644
index bc133893..00000000
--- a/data/src/data_utils/tree_canopy.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import requests
-import io
-import zipfile
-import geopandas as gpd
-from classes.featurelayer import FeatureLayer
-from config.config import USE_CRS
-
-
-def tree_canopy(primary_featurelayer):
- tree_url = (
- "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/pa.zip.zip"
- )
-
- tree_response = requests.get(tree_url)
-
- with io.BytesIO(tree_response.content) as f:
- with zipfile.ZipFile(f, "r") as zip_ref:
- zip_ref.extractall("tmp/")
-
- pa_trees = gpd.read_file("tmp/pa.shp")
- pa_trees = pa_trees.to_crs(USE_CRS)
- phl_trees = pa_trees[pa_trees["county"] == "Philadelphia County"]
- phl_trees = phl_trees[["tc_gap", "geometry"]]
-
- phl_trees.rename(columns={"tc_gap": "tree_canopy_gap"}, inplace=True)
-
- tree_canopy = FeatureLayer("Tree Canopy")
- tree_canopy.gdf = phl_trees
-
- primary_featurelayer.spatial_join(tree_canopy)
-
- return primary_featurelayer
diff --git a/data/src/data_utils/unsafe_buildings.py b/data/src/data_utils/unsafe_buildings.py
deleted file mode 100644
index b44edd00..00000000
--- a/data/src/data_utils/unsafe_buildings.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from classes.featurelayer import FeatureLayer
-from constants.services import UNSAFE_BUILDINGS_QUERY
-
-
-def unsafe_buildings(primary_featurelayer):
- unsafe_buildings = FeatureLayer(
- name="Unsafe Buildings",
- carto_sql_queries=UNSAFE_BUILDINGS_QUERY,
- use_wkb_geom_field="the_geom",
- cols=["opa_account_num"],
- )
-
- unsafe_buildings.gdf.loc[:, "unsafe_building"] = "Y"
-
- unsafe_buildings.gdf = unsafe_buildings.gdf.rename(
- columns={"opa_account_num": "opa_number"}
- )
-
- primary_featurelayer.opa_join(
- unsafe_buildings.gdf,
- "opa_number",
- )
-
- primary_featurelayer.gdf.loc[:, "unsafe_building"] = primary_featurelayer.gdf[
- "unsafe_building"
- ].fillna("N")
-
- return primary_featurelayer
diff --git a/data/src/data_utils/utils.py b/data/src/data_utils/utils.py
deleted file mode 100644
index b7b9ef4e..00000000
--- a/data/src/data_utils/utils.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-import re
-
-import requests
-
-
-def mask_password(value: str):
-    """Remove the password from a PostgreSQL connect string so we don't write it to logs, etc.
-
-    Args:
-        value (str): the unmasked string containing one or more postgres connect strings.
-
-    Returns:
-        str: the string with the password replaced by MASKED
-    """
-    return re.sub(r":\w+@", ":MASKED@", value)
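-
-# Illustrative example (hypothetical connect string):
-# mask_password("postgresql://user:secret@localhost:5432/db") returns
-# "postgresql://user:MASKED@localhost:5432/db".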
-
-
-def save_stream_url(url: str) -> str:
-    """Download the file at this URL to the tmp/ directory, streaming it in a memory-friendly way.
-    If the local file already exists, reuse it instead of downloading again.
- Args:
- url (str): the url of the zip file
-
- Returns:
- str: the relative local path of the saved zip file
- """
- local_filename = "tmp/" + url.split('/')[-1]
- if os.path.exists(local_filename):
- return local_filename
-
- with requests.get(url, stream=True) as r:
- r.raise_for_status()
- with open(local_filename, 'wb') as f:
- for chunk in r.iter_content(chunk_size=8192):
- # If you have chunk encoded response uncomment if
- # and set chunk_size parameter to None.
- #if chunk:
- f.write(chunk)
- f.close()
- r.close()
- return local_filename
diff --git a/data/src/data_utils/vacant_properties.py b/data/src/data_utils/vacant_properties.py
deleted file mode 100644
index d6573218..00000000
--- a/data/src/data_utils/vacant_properties.py
+++ /dev/null
@@ -1,186 +0,0 @@
-from classes.featurelayer import FeatureLayer, google_cloud_bucket
-from constants.services import VACANT_PROPS_LAYERS_TO_LOAD
-import geopandas as gpd
-from config.config import USE_CRS
-from io import BytesIO
-
-import pandas as pd
-
-
-def load_backup_data_from_gcs(file_name: str) -> gpd.GeoDataFrame:
- bucket = google_cloud_bucket()
- blob = bucket.blob(file_name)
- if not blob.exists():
- raise FileNotFoundError(f"File {file_name} not found in the GCS bucket.")
-
- file_bytes = blob.download_as_bytes()
- try:
- gdf = gpd.read_file(BytesIO(file_bytes))
- except Exception as e:
- raise ValueError(f"Error reading GeoJSON file: {e}")
-
- print("Loaded backup data from GCS.")
-
- # Ensure column names are consistent
- gdf = gdf.rename(
- columns={
- "ADDRESS": "address",
- "OWNER1": "owner_1",
- "OWNER2": "owner_2",
- "BLDG_DESC": "building_description",
- "CouncilDistrict": "council_district",
- "ZoningBaseDistrict": "zoning_base_district",
- "ZipCode": "zipcode",
- "OPA_ID": "opa_id",
- }
- )
-
- return gdf
-
-
-def check_null_percentage(df: pd.DataFrame, threshold: float = 0.05):
- """Checks if any column in the dataframe has more than the given threshold of null values."""
- null_percentages = df.isnull().mean()
- for col, pct in null_percentages.items():
- if col not in ["owner1", "owner2"] and pct > threshold:
- raise ValueError(
- f"Column '{col}' has more than {threshold * 100}% null values ({pct * 100}%)."
- )
-
-
-def vacant_properties() -> FeatureLayer:
- vacant_properties = FeatureLayer(
- name="Vacant Properties",
- esri_rest_urls=VACANT_PROPS_LAYERS_TO_LOAD,
- cols=[
- "ADDRESS",
- "OWNER1",
- "OWNER2",
- "BLDG_DESC",
- "COUNCILDISTRICT",
- "ZONINGBASEDISTRICT",
- "ZIPCODE",
- "OPA_ID",
- "parcel_type",
- ],
- )
-
- # Rename columns for consistency in the original data
- vacant_properties.gdf = vacant_properties.gdf.rename(
- columns={
- "ADDRESS": "address",
- "OWNER1": "owner_1",
- "OWNER2": "owner_2",
- "BLDG_DESC": "building_description",
- "COUNCILDISTRICT": "council_district",
- "ZONINGBASEDISTRICT": "zoning_base_district",
- "ZIPCODE": "zipcode",
- "OPA_ID": "opa_id",
- }
- )
-
- vacant_land_gdf = vacant_properties.gdf[
- vacant_properties.gdf["parcel_type"] == "Land"
- ]
- print(f"Vacant land data size: {len(vacant_land_gdf)} rows.")
-
- if len(vacant_land_gdf) < 20000:
- print("Vacant land data is below the threshold. Loading backup data from GCS.")
- backup_gdf = load_backup_data_from_gcs("vacant_indicators_land_06_2024.geojson")
-
- # Ensure CRS is consistent with project-wide CRS (USE_CRS)
- if backup_gdf.crs != USE_CRS:
- print(f"Reprojecting backup data from {backup_gdf.crs} to {USE_CRS}")
- backup_gdf = backup_gdf.to_crs(USE_CRS)
-
- # Ensure CRS is the same
- if backup_gdf.crs != vacant_properties.gdf.crs:
- backup_gdf = backup_gdf.to_crs(vacant_properties.gdf.crs)
-
- # Map backup dataset column names to match the original dataset
- backup_gdf = backup_gdf.rename(
- columns={
- "owner_1": "owner1",
- "owner_2": "owner2",
- "building_description": "bldg_desc",
- "council_district": "councildistrict",
- "zoning_base_district": "zoningbasedistrict",
- }
- )
-
- # Set parcel_type to "Land" for backup data
- backup_gdf["parcel_type"] = "Land"
-
- # Select only the columns present in the original dataset
- backup_gdf = backup_gdf[vacant_properties.gdf.columns]
-
- # Ensure all necessary columns are present in backup data
- for col in vacant_properties.gdf.columns:
- if col not in backup_gdf.columns:
- backup_gdf[col] = None
-
- # Check for column mismatches between original and backup datasets
- for col in vacant_properties.gdf.columns:
- if vacant_properties.gdf[col].dtype != backup_gdf[col].dtype:
- print(
- f"Warning: Data type mismatch in column '{col}'. Original: {vacant_properties.gdf[col].dtype}, Backup: {backup_gdf[col].dtype}"
- )
-
- # Verify if backup data contains more than expected null values
- check_null_percentage(backup_gdf)
-
- # Remove existing Land data
- vacant_properties.gdf = vacant_properties.gdf[
- vacant_properties.gdf["parcel_type"] != "Land"
- ]
-
- # Concatenate the backup data with the existing data
- print(f"Appending backup data ({len(backup_gdf)} rows) to the existing data.")
- vacant_properties.gdf = pd.concat(
- [vacant_properties.gdf, backup_gdf], ignore_index=True
- )
-
- # Ensure concatenated data is still a GeoDataFrame
- vacant_properties.gdf = gpd.GeoDataFrame(
- vacant_properties.gdf, geometry="geometry"
- )
-
- vacant_properties.gdf.dropna(subset=["opa_id"], inplace=True)
-
- # Final null value check before returning
- check_null_percentage(vacant_properties.gdf)
-
- # Final column renaming and selection
- vacant_properties.gdf = vacant_properties.gdf.rename(
- columns={
- "owner1": "owner_1",
- "owner2": "owner_2",
- "councildistrict": "council_district",
- "zoningbasedistrict": "zoning_base_district",
- }
- )
-
- # Select only the final columns needed
- final_columns = [
- "address",
- "owner_1",
- "owner_2",
- "council_district",
- "zoning_base_district",
- "zipcode",
- "opa_id",
- "parcel_type",
- "geometry",
- ]
-
- vacant_properties.gdf = vacant_properties.gdf[final_columns]
-
- # Ensure concatenated data is still a GeoDataFrame
- vacant_properties.gdf = gpd.GeoDataFrame(vacant_properties.gdf, geometry="geometry")
-
- before_drop = vacant_properties.gdf.shape[0]
- vacant_properties.gdf = vacant_properties.gdf.drop_duplicates(subset="opa_id")
- after_drop = vacant_properties.gdf.shape[0]
- print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")
-
- return vacant_properties
diff --git a/data/src/main.py b/data/src/main.py
index bc97fa43..096d85cc 100644
--- a/data/src/main.py
+++ b/data/src/main.py
@@ -2,6 +2,8 @@
import traceback
import pandas as pd
+
+from config.config import tiles_file_id_prefix
from config.psql import conn
from new_etl.classes.data_diff import DiffReport
from new_etl.classes.slack_reporters import (
@@ -10,6 +12,7 @@
send_pg_stats_to_slack,
)
from new_etl.data_utils import (
+ access_process,
city_owned_properties,
community_gardens,
conservatorship,
@@ -24,10 +27,12 @@
li_violations,
nbhoods,
negligent_devs,
+ opa_properties,
owner_type,
park_priority,
phs_properties,
ppr_properties,
+ priority_level,
pwd_parcels,
rco_geoms,
tactical_urbanism,
@@ -35,8 +40,7 @@
unsafe_buildings,
vacant_properties,
)
-
-from config.config import tiles_file_id_prefix
+from new_etl.database import to_postgis_with_schema
# Ensure the directory containing awkde is in the Python path
awkde_path = "/usr/src/app"
diff --git a/data/src/script.py b/data/src/script.py
deleted file mode 100644
index cf42ace6..00000000
--- a/data/src/script.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import sys
-import time
-
-from classes.backup_archive_database import BackupArchiveDatabase
-from classes.diff_report import DiffReport
-from config.config import FORCE_RELOAD, tiles_file_id_prefix
-from config.psql import conn
-from data_utils.access_process import access_process
-from data_utils.city_owned_properties import city_owned_properties
-from data_utils.community_gardens import community_gardens
-from data_utils.conservatorship import conservatorship
-from data_utils.contig_neighbors import contig_neighbors
-from data_utils.deliquencies import deliquencies
-from data_utils.dev_probability import dev_probability
-from data_utils.drug_crimes import drug_crimes
-from data_utils.gun_crimes import gun_crimes
-from data_utils.imm_dang_buildings import imm_dang_buildings
-from data_utils.l_and_i import l_and_i
-from data_utils.owner_type import owner_type
-from data_utils.nbhoods import nbhoods
-from data_utils.negligent_devs import negligent_devs
-from data_utils.opa_properties import opa_properties
-from data_utils.park_priority import park_priority
-from data_utils.phs_properties import phs_properties
-from data_utils.ppr_properties import ppr_properties
-from data_utils.priority_level import priority_level
-from data_utils.rco_geoms import rco_geoms
-from data_utils.tactical_urbanism import tactical_urbanism
-from data_utils.tree_canopy import tree_canopy
-from data_utils.unsafe_buildings import unsafe_buildings
-from data_utils.vacant_properties import vacant_properties
-
-import traceback
-
-from classes.slack_error_reporter import send_error_to_slack
-
-# Ensure the directory containing awkde is in the Python path
-awkde_path = "/usr/src/app"
-if awkde_path not in sys.path:
- sys.path.append(awkde_path)
-
-try:
- services = [
- city_owned_properties,
- phs_properties,
- l_and_i,
- rco_geoms,
- tree_canopy,
- nbhoods,
- gun_crimes,
- drug_crimes,
- deliquencies,
- opa_properties,
- unsafe_buildings,
- imm_dang_buildings,
- tactical_urbanism,
- conservatorship,
- owner_type,
- community_gardens,
- park_priority,
- ppr_properties,
- contig_neighbors,
- dev_probability,
- negligent_devs,
- ]
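-    # Note: order matters; later steps (e.g. priority_level below) read columns
-    # produced by these services, such as phs_care_program and the crime density percentiles.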
-
- # backup sql schema if we are reloading data
- backup: BackupArchiveDatabase = None
- if FORCE_RELOAD:
- # first archive any remaining backup that may exist from a previous run that errored
- backup = BackupArchiveDatabase()
- if backup.is_backup_schema_exists():
- backup.archive_backup_schema()
- conn.commit()
- time.sleep(1) # make sure we get a different timestamp
- backup = (
- BackupArchiveDatabase()
- ) # create a new one so we get a new timestamp
-
- backup.backup_schema()
- conn.commit()
-
- # Load Vacant Property Data
- dataset = vacant_properties()
-
- # Load and join other datasets
- for service in services:
- dataset = service(dataset)
-
- before_drop = dataset.gdf.shape[0]
- dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
- after_drop = dataset.gdf.shape[0]
- print(
- f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}"
- )
-
- # Add Priority Level
- dataset = priority_level(dataset)
-
- # Print the distribution of "priority_level"
- distribution = dataset.gdf["priority_level"].value_counts()
- print("Distribution of priority level:")
- print(distribution)
-
- # Add Access Process
- dataset = access_process(dataset)
-
- # Print the distribution of "access_process"
- distribution = dataset.gdf["access_process"].value_counts()
- print("Distribution of access process:")
- print(distribution)
-
- before_drop = dataset.gdf.shape[0]
- dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
- after_drop = dataset.gdf.shape[0]
-    print(f"Duplicate final dataset rows dropped: {before_drop - after_drop}")
-
- # back up old tiles file whether we are reloading data or not
- if backup is None:
- backup = BackupArchiveDatabase()
- backup.backup_tiles_file()
-
- # Finalize in Postgres
- dataset.gdf.to_postgis(
- "vacant_properties_end", conn, if_exists="replace", index=False
- )
-
- conn.commit()
-
- # Post to GCP
- dataset.build_and_publish(tiles_file_id_prefix)
-
- # if we are reloading, run the diff report, then archive the backup and finally prune old archives
- if FORCE_RELOAD:
- diff_report = DiffReport(timestamp_string=backup.timestamp_string)
- diff_report.run()
- backup.archive_backup_schema()
- conn.commit()
- backup.prune_old_archives()
- conn.commit()
-
- conn.close()
-
-except Exception as e:
- error_message = f"Error in backend job: {str(e)}\n\n{traceback.format_exc()}"
- send_error_to_slack(error_message)
- raise # Optionally re-raise the exception
diff --git a/data/src/test/conftest.py b/data/src/test/conftest.py
index a447bdbb..c7d2608a 100644
--- a/data/src/test/conftest.py
+++ b/data/src/test/conftest.py
@@ -8,7 +8,9 @@
def mock_gcp_bucket(monkeypatch):
mock_bucket = MagicMock(spec=Bucket)
- monkeypatch.setattr("classes.featurelayer.google_cloud_bucket", lambda: mock_bucket)
+ monkeypatch.setattr(
+ "new_etl.classes.featurelayer.google_cloud_bucket", lambda: mock_bucket
+ )
return mock_bucket
diff --git a/data/src/test/test_data_utils.py b/data/src/test/test_data_utils.py
index eeed7740..cc1e6c6c 100644
--- a/data/src/test/test_data_utils.py
+++ b/data/src/test/test_data_utils.py
@@ -4,12 +4,10 @@
from unittest.mock import MagicMock, Mock, patch
import geopandas as gpd
-from data_utils.park_priority import get_latest_shapefile_url, park_priority
-from data_utils.ppr_properties import ppr_properties
-from data_utils.vacant_properties import vacant_properties
from shapely.geometry import Point
from config.config import USE_CRS
+from new_etl.data_utils.park_priority import get_latest_shapefile_url, park_priority
class TestDataUtils(unittest.TestCase):
@@ -37,7 +35,9 @@ def setUpClass(cls):
def setUp(self):
# Set up the mocks that will be used in each test
- self.patcher1 = patch("data_utils.vacant_properties.google_cloud_bucket")
+ self.patcher1 = patch(
+ "new_etl.data_utils.vacant_properties.google_cloud_bucket"
+ )
self.patcher2 = patch("geopandas.read_file")
self.mock_gcs = self.patcher1.start()
@@ -66,7 +66,7 @@ def test_get_latest_shapefile_url(self):
self.assertTrue(url.startswith("https://"))
self.assertTrue(url.endswith(".zip"))
- @patch("data_utils.park_priority.requests.get")
+ @patch("new_etl.data_utils.park_priority.requests.get")
def test_get_latest_shapefile_url_mock(self, mock_get):
"""
Test the get_latest_shapefile_url function.
@@ -81,7 +81,7 @@ def test_get_latest_shapefile_url_mock(self, mock_get):
self.assertEqual(url, "https://example.com/shapefile.zip")
@patch(
- "data_utils.park_priority.requests.get"
+ "new_etl.data_utils.park_priority.requests.get"
) # Mock requests.get globally in park_priority
@patch("geopandas.read_file")
@patch("geopandas.GeoDataFrame.to_file") # Mock to_file to prevent actual writing
@@ -159,18 +159,6 @@ def test_park_priority(
self.assertEqual(result, mock_primary_layer)
- def test_ppr_properties(self):
- """
- Test the ppr properties layer. Simply construct the class for now to see if it works.
- """
- ppr_properties(vacant_properties())
-
- def test_vacant_properties(self):
- """
- Test the vacant properties layer. Simply construct the class to see if it works.
- """
- vacant_properties()
-
if __name__ == "__main__":
unittest.main()
diff --git a/data/src/test/test_diff_backup.py b/data/src/test/test_diff_backup.py
deleted file mode 100644
index f3dfb9c3..00000000
--- a/data/src/test/test_diff_backup.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import os
-from datetime import datetime
-
-import pytest
-from classes.backup_archive_database import (
- BackupArchiveDatabase,
- backup_schema_name,
- date_time_format,
-)
-from classes.diff_report import DiffReport
-from classes.featurelayer import google_cloud_bucket
-from config.psql import conn, local_engine
-from sqlalchemy import inspect
-
-pytestmark = pytest.mark.skip(
- reason="Skipping tests. The tests in test_diff_backup are designed for stateful, manual testing."
-)
-
-
-class TestDiffBackup:
- """
- test methods for data diffing and backing up
- """
-
- backup = BackupArchiveDatabase()
-
- def test_backup(self):
- """
- test the backup workflow without archiving
- """
- assert backup_schema_name not in inspect(local_engine).get_schema_names()
- TestDiffBackup.backup.backup_schema()
- assert backup_schema_name in inspect(local_engine).get_schema_names()
-
- def test_archive(self):
- """
- test the backup archiving
- """
- TestDiffBackup.backup.archive_backup_schema()
- conn.commit()
- assert backup_schema_name not in inspect(local_engine).get_schema_names()
-
- def test_prune_old_archives(self):
- """
- test dropping backups that are too old
- """
- TestDiffBackup.backup.prune_old_archives()
- conn.commit()
-
- def test_diff(self):
- """
- test the diff, assumes the backup_ table is there
- """
- diff = DiffReport(timestamp_string=TestDiffBackup.backup.timestamp_string)
- diff.run()
-
- def test_generate_table_detail_report(self):
- """print out the html for the vacant_properties diff"""
- diff = DiffReport()
- html = diff.generate_table_detail_report("vacant_properties")
- print(html)
-
- def test_detail_report(self):
- """print out the url for the generated and uploaded detail report for vacant_properties diff"""
- diff = DiffReport(timestamp_string=datetime.now().strftime(date_time_format))
- url = diff.detail_report("vacant_properties")
- print(url)
-
- @pytest.mark.skipif(
- not os.getenv("INTEGRATION_TESTING"),
- reason="For manual integration testing only. Export INTEGRATION_TESTING=True to run",
- )
- def test_upload_to_gcp(self):
- """test a simple upload to Google cloud"""
- bucket = google_cloud_bucket()
- blob = bucket.blob("test.txt")
- blob.upload_from_string("test")
-
- @pytest.mark.skipif(
- not os.getenv("INTEGRATION_TESTING"),
- reason="For manual integration testing only. Export INTEGRATION_TESTING=True to run",
- )
- def test_send_report_to_slack(self):
- """CAREFUL: if configured, this will send a message to Slack, potentially our prod channel"""
- diff = DiffReport()
- diff.report = "This is the report"
- diff.send_report_to_slack()
-
- @pytest.mark.skipif(
- not os.getenv("INTEGRATION_TESTING"),
- reason="For manual integration testing only. Export INTEGRATION_TESTING=True to run",
- )
- def test_email_report(self):
- """CAREFUL: if configured, this will send email if configured"""
- diff = DiffReport()
- diff.report = "This is the report"
- diff.email_report()
-
- def test_is_backup_schema_exists(self):
- """test method for whether the backup schema exists"""
- if TestDiffBackup.backup.is_backup_schema_exists():
- TestDiffBackup.backup.archive_backup_schema()
- conn.commit()
- assert not TestDiffBackup.backup.is_backup_schema_exists()
- else:
- TestDiffBackup.backup.backup_schema()
- assert TestDiffBackup.backup.is_backup_schema_exists()
- TestDiffBackup.backup.archive_backup_schema()
- conn.commit()
- assert not TestDiffBackup.backup.is_backup_schema_exists()
-
- def test_backup_tiles_file(self):
- """test backing up the tiles file"""
- TestDiffBackup.backup.backup_tiles_file()
diff --git a/data/src/test/test_slack_error_reporter.py b/data/src/test/test_slack_error_reporter.py
index 4a30653b..4ff2dbea 100644
--- a/data/src/test/test_slack_error_reporter.py
+++ b/data/src/test/test_slack_error_reporter.py
@@ -5,17 +5,17 @@
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
-from classes.slack_error_reporter import (
- send_error_to_slack,
-) # Ensure correct file import
+from new_etl.classes.slack_reporters import (
+ send_error_to_slack, # Ensure correct file import
+)
class TestSlackNotifier(unittest.TestCase):
@patch(
- "classes.slack_error_reporter.WebClient.chat_postMessage"
+ "new_etl.classes.slack_reporters.WebClient.chat_postMessage"
) # Correct patching
@patch(
- "classes.slack_error_reporter.os.getenv", return_value="mock_slack_token"
+ "new_etl.classes.slack_reporters.os.getenv", return_value="mock_slack_token"
) # Correct patching
def test_send_error_to_slack(self, _mock_getenv, mock_slack_post):
"""Test that Slack error reporting is triggered correctly."""
@@ -33,10 +33,10 @@ def test_send_error_to_slack(self, _mock_getenv, mock_slack_post):
)
@patch(
- "classes.slack_error_reporter.WebClient.chat_postMessage"
+ "new_etl.classes.slack_reporters.WebClient.chat_postMessage"
) # Correct patching
@patch(
- "classes.slack_error_reporter.os.getenv", return_value=None
+ "new_etl.classes.slack_reporters.os.getenv", return_value=None
) # Simulate missing Slack token
def test_no_error_no_slack_message(self, _mock_getenv, mock_slack_post):
"""Test that Slack notification is not triggered if there's no error."""
diff --git a/data/src/test/test_utils.py b/data/src/test/test_utils.py
index 9ea090f4..d8dd1bff 100644
--- a/data/src/test/test_utils.py
+++ b/data/src/test/test_utils.py
@@ -1,6 +1,6 @@
import re
-from data_utils import utils
+from new_etl.data_utils import utils
class TestUtils: