Merged
51 commits
7c4adde
test: get opa properties stats validator running (unit tests still ne…
nlebovits Jun 21, 2025
9b83f3d
test: add unit and schema tests for opa properties validator
nlebovits Jun 21, 2025
d4854fd
refactor: ruff
nlebovits Jun 21, 2025
9baffb8
test: add validator for pwd parcels; make sure loaders handle carto d…
nlebovits Jun 21, 2025
806682f
test: abstract test data to conftest.py
nlebovits Jun 21, 2025
c76f364
test: add unit + schema tests for vacant properties validator; abstra…
nlebovits Jun 21, 2025
8c133fd
refactor: separate testing by concern
nlebovits Jun 21, 2025
09f6142
test: add stats validator + unit and schema tests for it
nlebovits Jun 21, 2025
660a843
refactor: abstract repeated code into base validator
nlebovits Jun 21, 2025
f4a0970
chore: ignore cursor files
nlebovits Jun 21, 2025
5f438b4
test: add nbhoods validator plus unit + schema tests
nlebovits Jun 21, 2025
e56682c
test: add validation for rco geoms; better handling of base test data
nlebovits Jun 22, 2025
dfd0c87
test: add validation for city owned properties
nlebovits Jun 22, 2025
02dea3a
test: add validator and validator test for phs properties
nlebovits Jun 22, 2025
99e2e4f
test: add validator and validator tests for community gardens
nlebovits Jun 22, 2025
d8c8cf4
test: add validator and validator tests for ppr properties
nlebovits Jun 22, 2025
aad0473
feat: enhance data validation framework with debugging capabilities
nlebovits Jun 22, 2025
8d6afd8
refactor: ruff
nlebovits Jun 22, 2025
7dbaada
style: ruff
nlebovits Jun 24, 2025
4542a1b
fix: clean up issues with validator tests
nlebovits Jun 24, 2025
df24e85
feat: add script to quickly test if services are working
nlebovits Jun 24, 2025
9de5b92
fix: remove redundant columns in vacant properties validator
nlebovits Jun 25, 2025
3f5b813
test: add validator, validator unit test for unsafe buildings
nlebovits Jun 25, 2025
22c19bf
test: add validator, validator unit test for tree canopy gap
nlebovits Jun 25, 2025
df3006a
fix: remove obsolete DOR parcels dataset
nlebovits Jun 25, 2025
ff4cc52
test: add validator, validator unit tests for imm dang buildings
nlebovits Jun 25, 2025
398fc93
test: add validation, validator unit tests for park priority scores
nlebovits Jun 25, 2025
31add07
test: add validator, validator unit tests for kde base, gun crimes kde
nlebovits Jun 27, 2025
02cef5b
fix: place validators in the right folder
nlebovits Jun 27, 2025
aabc5ae
test: add drug crime kde validator, validator unit tests
nlebovits Jun 27, 2025
bcda9bd
feat: add pylint similarity checking to precommit hook
nlebovits Jun 27, 2025
325d28d
test: finish kde validators, add validator unit tests
nlebovits Jun 27, 2025
fbf5c66
test: add li violations validator, validator test (stats validator missing)
nlebovits Jun 27, 2025
bf73e7d
test: add validator, validator unit tests for access process
nlebovits Jun 27, 2025
81f7413
fix: rebase against staging, incorporate changes
nlebovits Jun 28, 2025
0ac5bae
test: add delinquencies validator, validator unit tests
nlebovits Jun 28, 2025
f303e2d
feat: update test_service.py to include all possible services
nlebovits Jun 28, 2025
0e3647b
test: add validator, validator unit tests for contig neighbors
nlebovits Jun 28, 2025
4ef9c90
style: ruff
nlebovits Jun 28, 2025
8884dbd
test: add conservatorship validator, validator unit tests
nlebovits Jun 28, 2025
8e9d014
test: add tactical urbanism validator, validator unit tests
nlebovits Jun 28, 2025
7d12661
test: add validator for priority level, add validator unit tests, cle…
nlebovits Jun 29, 2025
2a19b43
test: add dev probability validator, validator unit tests
nlebovits Jun 29, 2025
3ca4f09
style: upgrade ruff to match precommit hook
nlebovits Jun 29, 2025
7d3c98c
fix: add backup vacant land and buildings data in local file for easy…
nlebovits Jun 29, 2025
5501a83
test: finalize validation tests
nlebovits Jun 29, 2025
a881bcf
fix: resolve merge conflicts
nlebovits Jun 29, 2025
b43c23b
fix: remove obsolete files
nlebovits Jun 29, 2025
fbe7b07
feat: add documentation for backup data
nlebovits Jun 29, 2025
0e54a0f
fix: cleanup backend instructions
nlebovits Jun 29, 2025
46c9d85
fix: update backend docs, fix pyproject.toml
nlebovits Jun 29, 2025
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
backup_data/ filter=lfs diff=lfs merge=lfs -text
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,6 @@ data/src/app/service-account-key.json

# Cached and temporary data files from pipeline
storage/

# cursor
.cursor/
10 changes: 9 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,15 @@ repos:
echo "All MI grades are B or above."'
language: system
files: \.py$
pass_filenames: true
pass_filenames:
true
# Code similarity detection
- id: pylint-similarities
name: Code similarity detection (non-blocking)
entry: bash -c 'echo "Checking for code similarities in data/src/..."; pylint --disable=all --enable=similarities --score=no data/src/ || true'
language: system
files: ^data/src/.*\.py$
pass_filenames: false
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.3'
hooks:
Expand Down
10 changes: 10 additions & 0 deletions data/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Environment variables for Clean and Green Philly backend
# Copy this file to data/.env and fill in your actual values

# Google Cloud Platform credentials
CLEAN_GREEN_GOOGLE_KEY=your-api-key-here
GOOGLE_CLOUD_BUCKET_NAME=your-bucket-name-here
GOOGLE_CLOUD_PROJECT=your-project-id

# Slack integration for diff reporting
CAGP_SLACK_API_TOKEN=your-slack-token
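
A minimal sketch (not part of this diff) of how these variables might be consumed by the pipeline, assuming python-dotenv is used to load `data/.env`; the loading path and variable handling shown here are illustrative rather than the repository's actual startup code:

```python
# Illustrative only: how the pipeline might load the variables from
# data/.env.example. The use of python-dotenv and the exact path are
# assumptions, not the repository's actual startup code.
import os

from dotenv import load_dotenv

load_dotenv("data/.env")  # hypothetical location; copy .env.example here first

google_key = os.environ["CLEAN_GREEN_GOOGLE_KEY"]
bucket_name = os.environ["GOOGLE_CLOUD_BUCKET_NAME"]
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
slack_token = os.getenv("CAGP_SLACK_API_TOKEN")  # optional Slack integration
```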
15 changes: 15 additions & 0 deletions data/backup_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## Backup Data

This folder contains 1) backup vacant properties data from June 2024, the last time these data were reasonably accurate; and 2) the final outputs of our pipeline before project shutdown in July of 2025.

### Vacancy Data

The land backup file contains vacant land data sent to us by the Department of Licenses and Inspections, corresponding to the last reasonably complete dataset on vacant land before the City [stopped collecting vacancy data](https://www.inquirer.com/opinion/commentary/mayor-parker-housing-plan-missing-data-20250625.html).

The buildings backup contains data that we collected ourselves in June of 2024. It is likely missing a thousand or more buildings, since we had not yet realized that the buildings dataset was also corrupted, but it is the best data we have available under the circumstances.

Combined, these represent about 34,000 properties. The pipeline is configured to run using these backup data unless the City's APIs suddenly start returning data above the expected threshold again.

### Pipeline Outputs

As it's currently configured, the pipeline will return new data for everything _except_ the vacant properties themselves, for which it uses our June of 2024 backups. This means that all associated data are the currently available numbers from their corresponding services, but we have no way to update the vacant properties data themselves. We have stored these outputs here in both GeoParquet format (representing all 580,000+ properties in Philadelphia) and the PMTiles that we use to visualize vacant properties on the website (representing ~34,000 vacant properties from June of 2024, with the rest of the data from July of 2025).
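
The fallback described above (use live vacancy data only when the City's API returns a plausible number of rows, otherwise read the June 2024 backups) could be sketched as follows. The function name, threshold, and buildings backup filename are assumptions for illustration, not the pipeline's actual code:

```python
# Hypothetical sketch of the backup fallback described in the README above.
# load_vacant_properties, EXPECTED_MIN_ROWS, and the buildings backup
# filename are all assumed names for illustration only.
import geopandas as gpd
import pandas as pd

EXPECTED_MIN_ROWS = 20_000  # assumed "healthy API" threshold


def load_vacant_properties(fetch_live) -> gpd.GeoDataFrame:
    """Return live vacancy data if plausible, else the June 2024 backups."""
    try:
        live = fetch_live()  # caller-supplied function that hits the City API
        if len(live) >= EXPECTED_MIN_ROWS:
            return live
    except Exception:
        pass  # treat API failures like an implausibly small result

    land = gpd.read_parquet("data/backup_data/land_backup_2024_06_24.parquet")
    buildings = gpd.read_parquet(
        "data/backup_data/buildings_backup_2024_06_24.parquet"  # assumed name
    )
    combined = pd.concat([land, buildings], ignore_index=True)
    return gpd.GeoDataFrame(combined, crs=land.crs)
```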
Binary file not shown.
Binary file added data/backup_data/land_backup_2024_06_24.parquet
Binary file not shown.
1 change: 1 addition & 0 deletions data/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ services:
- CAGP_SLACK_API_TOKEN
volumes:
- ./src:/app/src
- ./backup_data:/app/backup_data
- ~/.config/gcloud/application_default_credentials.json:/app/service-account-key.json
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
Expand Down
4 changes: 3 additions & 1 deletion data/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ dependencies = [
"networkx~=3.4.2",
"pandas==2.2.2",
"pandera~=0.24.0",
"pre-commit>=4.2.0",
"pyarrow~=18.1.0",
"pydantic==2.8.2",
"pylint>=3.3.7",
"rasterio~=1.4.3",
"requests~=2.32.3",
"scikit-learn~=1.6.0",
Expand All @@ -38,7 +40,7 @@ dev = [
"pytest~=8.3.4",
"vulture~=2.14",
"radon~=6.0.1",
"ruff~=0.8.2",
"ruff~=0.12.0",
]

[tool.mypy]
Expand Down
66 changes: 40 additions & 26 deletions data/src/classes/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
from datetime import datetime
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import List

import geopandas as gpd
from tqdm import tqdm

from src.config.config import CACHE_FRACTION, ROOT_DIRECTORY
from src.config.config import CACHE_FRACTION, ROOT_DIRECTORY, get_logger

print(f"Root directory is {ROOT_DIRECTORY}")

Expand Down Expand Up @@ -120,10 +121,9 @@ def check_source_cache_file_exists(
table_name (str): The name of the table of source data.
load_type (LoadType): The destination type of the file (either SOURCE_CACHE or PIPELINE_CACHE).
"""
cache_logger = get_logger("cache")
start_time = time.time()
print(
f" FileManager.check_source_cache_file_exists: Checking for {table_name}"
)
cache_logger.info(f"Checking for {table_name}")

directory = (
self.source_cache_directory
Expand All @@ -140,8 +140,8 @@ def check_source_cache_file_exists(
result = len(files) > 0
total_time = time.time() - start_time

print(
f" FileManager.check_source_cache_file_exists: Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
cache_logger.info(
f"Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
)
return result

Expand All @@ -154,10 +154,9 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
GeoDataFrame: The dataframe loaded from the most recent cached file.
None: If no files exist for the given table name.
"""
cache_logger = get_logger("cache")
start_time = time.time()
print(
f" FileManager.get_most_recent_cache: Loading most recent cache for {table_name}"
)
cache_logger.info(f"Loading most recent cache for {table_name}")

# Use glob pattern matching for more efficient file searching
pattern = os.path.join(self.source_cache_directory, f"*{table_name}*.parquet")
Expand All @@ -167,19 +166,19 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
glob_time = time.time() - glob_start

if not cached_files:
print(" FileManager.get_most_recent_cache: No cached files found")
cache_logger.info("No cached files found")
return None

# Get the most recent file by modification time
mtime_start = time.time()
most_recent_file = max(cached_files, key=os.path.getmtime)
mtime_time = time.time() - mtime_start

print(
f" FileManager.get_most_recent_cache: Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
cache_logger.info(
f"Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
)
print(
f" FileManager.get_most_recent_cache: Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
cache_logger.info(
f"Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
)

# Load the parquet file
Expand All @@ -188,8 +187,8 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
load_time = time.time() - load_start

total_time = time.time() - start_time
print(
f" FileManager.get_most_recent_cache: Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
cache_logger.info(
f"Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
)

return gdf
Expand All @@ -205,7 +204,10 @@ def load_gdf(
file_type (FileType): The type of the file (GEOJSON or PARQUET).
load_type (LoadType): The destination type of the file (TEMP or CACHE).
"""
cache_logger = get_logger("cache")
file_path = self.get_file_path(file_name, load_type, file_type)
cache_logger.info(f"Loading {file_name} from {file_path}")

if os.path.exists(file_path):
gdf = (
gpd.read_parquet(file_path)
Expand Down Expand Up @@ -234,37 +236,39 @@ def save_gdf(
file_type (FileType): The type of the file (GEOJSON or PARQUET).
load_type (LoadType): The destination type of the file (TEMP or CACHE).
"""
cache_logger = get_logger("cache")
cache_logger.info(f"Saving {file_name} to {load_type.value}/{file_type.value}")
start_time = time.time()
print(f" FileManager.save_gdf: Starting save for {file_name}")
cache_logger.info(f"Starting save for {file_name}")

file_path = self.get_file_path(file_name, load_type, file_type)
print(f" FileManager.save_gdf: Target path: {file_path}")
cache_logger.info(f"Target path: {file_path}")

if file_type == FileType.PARQUET:
print(
f" FileManager.save_gdf: Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
cache_logger.info(
f"Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
)
parquet_start = time.time()
gdf.to_parquet(file_path, index=False)
parquet_time = time.time() - parquet_start
print(f" FileManager.save_gdf: Parquet write took {parquet_time:.2f}s")
cache_logger.info(f"Parquet write took {parquet_time:.2f}s")
elif file_type == FileType.GEOJSON:
print(" FileManager.save_gdf: Writing GeoJSON file")
cache_logger.info("Writing GeoJSON file")
geojson_start = time.time()
gdf.to_file(file_path, driver="GeoJSON")
geojson_time = time.time() - geojson_start
print(f" FileManager.save_gdf: GeoJSON write took {geojson_time:.2f}s")
cache_logger.info(f"GeoJSON write took {geojson_time:.2f}s")
elif file_type == FileType.CSV:
print(" FileManager.save_gdf: Writing CSV file")
cache_logger.info("Writing CSV file")
csv_start = time.time()
gdf.to_csv(file_path)
csv_time = time.time() - csv_start
print(f" FileManager.save_gdf: CSV write took {csv_time:.2f}s")
cache_logger.info(f"CSV write took {csv_time:.2f}s")
else:
raise ValueError(f"Unsupported file type: {file_type}")

total_time = time.time() - start_time
print(f" FileManager.save_gdf: Total save operation took {total_time:.2f}s")
cache_logger.info(f"Total save operation took {total_time:.2f}s")

def save_fractional_gdf(
self,
Expand Down Expand Up @@ -315,3 +319,13 @@ def extract_all(self, buffer: BytesIO) -> None:
destination = self.temp_directory
with zipfile.ZipFile(buffer) as zip_ref:
zip_ref.extractall(destination)

def get_cache_file_path(self, cache_key: str) -> Path:
"""
Get the path for a cache file.
"""
cache_logger = get_logger("cache")
cache_dir = ROOT_DIRECTORY / "storage" / "cache"
cache_file = cache_dir / f"{cache_key}.parquet"
cache_logger.info(f"Cache file path: {cache_file}")
return cache_file
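
The refactor above routes all of `FileManager`'s progress output through `get_logger("cache")`, imported from `src.config.config`, whose definition is not shown in this diff. A minimal sketch consistent with how it is called might look like this; the logger name prefix, format string, and level are assumptions:

```python
# Sketch only: get_logger is imported from src.config.config but not shown
# in this diff. A minimal implementation consistent with its call sites
# (get_logger("cache").info(...)) might look like this.
import logging


def get_logger(name: str) -> logging.Logger:
    """Return a namespaced logger, configured once per process."""
    logger = logging.getLogger(f"pipeline.{name}")
    if not logger.handlers:  # avoid attaching duplicate handlers
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger
```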