
Commit a449d1f

Merge pull request #1251 from CodeForPhilly/lebovits/issu1225-stats-validators
Lebovits/issu1225 stats validators
2 parents 331c6c5 + 46c9d85 commit a449d1f


100 files changed: +14618 and -1896 lines changed


.gitattributes

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+backup_data/ filter=lfs diff=lfs merge=lfs -text

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -65,3 +65,6 @@ data/src/app/service-account-key.json
 
 # Cached and temporary data files from pipeline
 storage/
+
+# cursor
+.cursor/

.pre-commit-config.yaml

Lines changed: 9 additions & 1 deletion
@@ -23,7 +23,15 @@ repos:
           echo "All MI grades are B or above."'
         language: system
        files: \.py$
-        pass_filenames: true
+        pass_filenames:
+          true
+      # Code similarity detection
+      - id: pylint-similarities
+        name: Code similarity detection (non-blocking)
+        entry: bash -c 'echo "Checking for code similarities in data/src/..."; pylint --disable=all --enable=similarities --score=no data/src/ || true'
+        language: system
+        files: ^data/src/.*\.py$
+        pass_filenames: false
   - repo: https://github.com/jendrikseipp/vulture
     rev: 'v2.3'
     hooks:
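
The new pylint-similarities hook is deliberately non-blocking (note the trailing || true in its entry) and can be run on demand with pre-commit run pylint-similarities --all-files. For illustration only, and not part of this commit, the same check could be driven from Python roughly as follows, assuming pylint is installed and the script is run from the repository root:

import subprocess

def check_similarities(path: str = "data/src/") -> None:
    """Run pylint's duplicate-code (similarities) checker and print its report."""
    result = subprocess.run(
        ["pylint", "--disable=all", "--enable=similarities", "--score=no", path],
        capture_output=True,
        text=True,
    )
    # pylint exits non-zero when similarities are found; mirror the hook's
    # "|| true" by reporting instead of raising.
    print(result.stdout or "No similarities reported.")

if __name__ == "__main__":
    check_similarities()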

data/.env.example

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Environment variables for Clean and Green Philly backend
2+
# Copy this file to data/.env and fill in your actual values
3+
4+
# Google Cloud Platform credentials
5+
CLEAN_GREEN_GOOGLE_KEY=your-api-key-here
6+
GOOGLE_CLOUD_BUCKET_NAME=your-bucket-name-here
7+
GOOGLE_CLOUD_PROJECT=your-project-id
8+
9+
# Slack integration for diff reporting
10+
CAGP_SLACK_API_TOKEN=your-slack-token
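
The variable names above are the ones the pipeline expects; the helper below is only a hypothetical sketch (not code from this repository) showing how the pipeline might verify that all of them are set before running:

import os

REQUIRED_VARS = [
    "CLEAN_GREEN_GOOGLE_KEY",
    "GOOGLE_CLOUD_BUCKET_NAME",
    "GOOGLE_CLOUD_PROJECT",
    "CAGP_SLACK_API_TOKEN",
]

def load_pipeline_env() -> dict[str, str]:
    """Collect required settings from the environment, failing fast if any are missing."""
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")
    return {name: os.environ[name] for name in REQUIRED_VARS}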

data/backup_data/README.md

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+## Backup Data
+
+This folder contains 1) backup vacant properties data from June 2024, the last time these data were reasonably accurate, and 2) the final outputs of our pipeline before the project shut down in July of 2025.
+
+### Vacancy Data
+
+The land backup file contains vacant land data sent to us by the Department of Licenses and Inspections, corresponding to the last reasonably complete dataset on vacant land before the City [stopped collecting vacancy data](https://www.inquirer.com/opinion/commentary/mayor-parker-housing-plan-missing-data-20250625.html).
+
+The buildings backup contains data that we collected ourselves in June of 2024. It is likely missing a thousand or more buildings, since we hadn't realized at the time that the buildings dataset was also corrupted, but it is the best data we have available under the circumstances.
+
+Combined, these represent about 34,000 properties. The pipeline is configured to run with these backup data unless the City's APIs start returning data above the expected threshold again.
+
+### Pipeline Outputs
+
+As currently configured, the pipeline returns new data for everything _except_ the vacant properties themselves, for which it uses our June 2024 backups. This means that all associated data are the currently available numbers from their corresponding services, but we have no way to update the vacant properties data themselves. We have stored these outputs here in both GeoParquet format (covering all 580,000+ properties in Philadelphia) and the PMTiles we use to visualize vacant properties on the website (covering the ~34,000 vacant properties from June 2024, with the rest of the data from July 2025).
Two binary files (628 KB and 3.03 MB) were also added; binary contents are not shown in the diff.
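
The threshold fallback described in the README lives in the pipeline itself and is not shown in this section. The following is only a hypothetical sketch of the idea, with an assumed threshold value and backup file name:

import geopandas as gpd

# Assumed threshold and backup path; the real values live in the pipeline configuration.
EXPECTED_MIN_ROWS = 20_000
BACKUP_PATH = "backup_data/vacant_properties.parquet"

def load_vacant_properties(api_gdf: gpd.GeoDataFrame | None) -> gpd.GeoDataFrame:
    """Use fresh API data only if it clears the expected row count; otherwise use the June 2024 backup."""
    if api_gdf is not None and len(api_gdf) >= EXPECTED_MIN_ROWS:
        return api_gdf
    return gpd.read_parquet(BACKUP_PATH)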

data/docker-compose.yml

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ services:
       - CAGP_SLACK_API_TOKEN
     volumes:
       - ./src:/app/src
+      - ./backup_data:/app/backup_data
       - ~/.config/gcloud/application_default_credentials.json:/app/service-account-key.json
       - /etc/timezone:/etc/timezone:ro
       - /etc/localtime:/etc/localtime:ro

data/pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -18,8 +18,10 @@ dependencies = [
     "networkx~=3.4.2",
     "pandas==2.2.2",
     "pandera~=0.24.0",
+    "pre-commit>=4.2.0",
     "pyarrow~=18.1.0",
     "pydantic==2.8.2",
+    "pylint>=3.3.7",
     "rasterio~=1.4.3",
     "requests~=2.32.3",
     "scikit-learn~=1.6.0",
@@ -38,7 +40,7 @@ dev = [
     "pytest~=8.3.4",
     "vulture~=2.14",
     "radon~=6.0.1",
-    "ruff~=0.8.2",
+    "ruff~=0.12.0",
 ]
 
 [tool.mypy]

data/src/classes/file_manager.py

Lines changed: 40 additions & 26 deletions
@@ -5,12 +5,13 @@
 from datetime import datetime
 from enum import Enum
 from io import BytesIO
+from pathlib import Path
 from typing import List
 
 import geopandas as gpd
 from tqdm import tqdm
 
-from src.config.config import CACHE_FRACTION, ROOT_DIRECTORY
+from src.config.config import CACHE_FRACTION, ROOT_DIRECTORY, get_logger
 
 print(f"Root directory is {ROOT_DIRECTORY}")
 
@@ -120,10 +121,9 @@ def check_source_cache_file_exists(
             table_name (str): The name of the table of source data.
             load_type (LoadType): The destination type of the file (either SOURCE_CACHE or PIPELINE_CACHE).
         """
+        cache_logger = get_logger("cache")
         start_time = time.time()
-        print(
-            f" FileManager.check_source_cache_file_exists: Checking for {table_name}"
-        )
+        cache_logger.info(f"Checking for {table_name}")
 
         directory = (
             self.source_cache_directory
@@ -140,8 +140,8 @@ def check_source_cache_file_exists(
         result = len(files) > 0
         total_time = time.time() - start_time
 
-        print(
-            f" FileManager.check_source_cache_file_exists: Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
+        cache_logger.info(
+            f"Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
         )
         return result
 
@@ -154,10 +154,9 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
             GeoDataFrame: The dataframe loaded from the most recent cached file.
             None: If no files exist for the given table name.
         """
+        cache_logger = get_logger("cache")
         start_time = time.time()
-        print(
-            f" FileManager.get_most_recent_cache: Loading most recent cache for {table_name}"
-        )
+        cache_logger.info(f"Loading most recent cache for {table_name}")
 
         # Use glob pattern matching for more efficient file searching
         pattern = os.path.join(self.source_cache_directory, f"*{table_name}*.parquet")
@@ -167,19 +166,19 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
         glob_time = time.time() - glob_start
 
         if not cached_files:
-            print(" FileManager.get_most_recent_cache: No cached files found")
+            cache_logger.info("No cached files found")
             return None
 
         # Get the most recent file by modification time
         mtime_start = time.time()
         most_recent_file = max(cached_files, key=os.path.getmtime)
         mtime_time = time.time() - mtime_start
 
-        print(
-            f" FileManager.get_most_recent_cache: Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
+        cache_logger.info(
+            f"Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
         )
-        print(
-            f" FileManager.get_most_recent_cache: Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
+        cache_logger.info(
+            f"Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
         )
 
         # Load the parquet file
@@ -188,8 +187,8 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
         load_time = time.time() - load_start
 
         total_time = time.time() - start_time
-        print(
-            f" FileManager.get_most_recent_cache: Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
+        cache_logger.info(
+            f"Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
         )
 
         return gdf
@@ -205,7 +204,10 @@ def load_gdf(
             file_type (FileType): The type of the file (GEOJSON or PARQUET).
             load_type (LoadType): The destination type of the file (TEMP or CACHE).
         """
+        cache_logger = get_logger("cache")
         file_path = self.get_file_path(file_name, load_type, file_type)
+        cache_logger.info(f"Loading {file_name} from {file_path}")
+
         if os.path.exists(file_path):
             gdf = (
                 gpd.read_parquet(file_path)
@@ -234,37 +236,39 @@ def save_gdf(
             file_type (FileType): The type of the file (GEOJSON or PARQUET).
             load_type (LoadType): The destination type of the file (TEMP or CACHE).
         """
+        cache_logger = get_logger("cache")
+        cache_logger.info(f"Saving {file_name} to {load_type.value}/{file_type.value}")
         start_time = time.time()
-        print(f" FileManager.save_gdf: Starting save for {file_name}")
+        cache_logger.info(f"Starting save for {file_name}")
 
         file_path = self.get_file_path(file_name, load_type, file_type)
-        print(f" FileManager.save_gdf: Target path: {file_path}")
+        cache_logger.info(f"Target path: {file_path}")
 
         if file_type == FileType.PARQUET:
-            print(
-                f" FileManager.save_gdf: Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
+            cache_logger.info(
+                f"Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
             )
             parquet_start = time.time()
             gdf.to_parquet(file_path, index=False)
             parquet_time = time.time() - parquet_start
-            print(f" FileManager.save_gdf: Parquet write took {parquet_time:.2f}s")
+            cache_logger.info(f"Parquet write took {parquet_time:.2f}s")
         elif file_type == FileType.GEOJSON:
-            print(" FileManager.save_gdf: Writing GeoJSON file")
+            cache_logger.info("Writing GeoJSON file")
            geojson_start = time.time()
             gdf.to_file(file_path, driver="GeoJSON")
             geojson_time = time.time() - geojson_start
-            print(f" FileManager.save_gdf: GeoJSON write took {geojson_time:.2f}s")
+            cache_logger.info(f"GeoJSON write took {geojson_time:.2f}s")
         elif file_type == FileType.CSV:
-            print(" FileManager.save_gdf: Writing CSV file")
+            cache_logger.info("Writing CSV file")
             csv_start = time.time()
             gdf.to_csv(file_path)
             csv_time = time.time() - csv_start
-            print(f" FileManager.save_gdf: CSV write took {csv_time:.2f}s")
+            cache_logger.info(f"CSV write took {csv_time:.2f}s")
         else:
             raise ValueError(f"Unsupported file type: {file_type}")
 
         total_time = time.time() - start_time
-        print(f" FileManager.save_gdf: Total save operation took {total_time:.2f}s")
+        cache_logger.info(f"Total save operation took {total_time:.2f}s")
 
     def save_fractional_gdf(
         self,
@@ -315,3 +319,13 @@ def extract_all(self, buffer: BytesIO) -> None:
         destination = self.temp_directory
         with zipfile.ZipFile(buffer) as zip_ref:
             zip_ref.extractall(destination)
+
+    def get_cache_file_path(self, cache_key: str) -> Path:
+        """
+        Get the path for a cache file.
+        """
+        cache_logger = get_logger("cache")
+        cache_dir = ROOT_DIRECTORY / "storage" / "cache"
+        cache_file = cache_dir / f"{cache_key}.parquet"
+        cache_logger.info(f"Cache file path: {cache_file}")
+        return cache_file
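
The get_logger helper imported from src.config.config is not part of this diff. The following is a minimal sketch of what such a named-logger factory typically looks like, assuming a plain standard-library setup rather than the project's actual implementation:

import logging

def get_logger(name: str) -> logging.Logger:
    """Return a named logger with a single stream handler attached once."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger

With a logger like this, the new get_cache_file_path method resolves cache files under storage/cache/ (the storage/ directory newly ignored in .gitignore) and logs each resolved path, for example file_manager.get_cache_file_path("vacant_properties") for a hypothetical cache key.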
