Merged
51 commits
7c4adde
test: get opa properties stats validator running (unit tests still ne…
nlebovits Jun 21, 2025
9b83f3d
test: add unit and schema tests for opa properties validator
nlebovits Jun 21, 2025
d4854fd
refactor: ruff
nlebovits Jun 21, 2025
9baffb8
test: add validator for pwd parcels; make sure loaders handle carto d…
nlebovits Jun 21, 2025
806682f
test: abstract test data to conftest.py
nlebovits Jun 21, 2025
c76f364
test: add unit + schema tests for vacant properties validator; abstra…
nlebovits Jun 21, 2025
8c133fd
refactor: separate testing by concern
nlebovits Jun 21, 2025
09f6142
test: add stats validator + unit and schema tests for it
nlebovits Jun 21, 2025
660a843
refactor: abstract repeated code into base validator
nlebovits Jun 21, 2025
f4a0970
chore: ignore cursor files
nlebovits Jun 21, 2025
5f438b4
test: add nbhoods validator plus unit + schema tests
nlebovits Jun 21, 2025
e56682c
test: add validation for rco geoms; better handling of base test data
nlebovits Jun 22, 2025
dfd0c87
test: add validation for city owned properties
nlebovits Jun 22, 2025
02dea3a
test: add validator and validator test for phs properties
nlebovits Jun 22, 2025
99e2e4f
test: add validator and validator tests for community gardens
nlebovits Jun 22, 2025
d8c8cf4
test: add validator and validator tests for ppr properties
nlebovits Jun 22, 2025
aad0473
feat: enhance data validation framework with debugging capabilities
nlebovits Jun 22, 2025
8d6afd8
refactor: ruff
nlebovits Jun 22, 2025
7dbaada
style: ruff
nlebovits Jun 24, 2025
4542a1b
fix: clean up issues with validator tests
nlebovits Jun 24, 2025
df24e85
feat: add script to quickly test if services are working
nlebovits Jun 24, 2025
9de5b92
fix: remove redundant columns in vacant properties validator
nlebovits Jun 25, 2025
3f5b813
test: add validator, validator unit test for unsafe buildings
nlebovits Jun 25, 2025
22c19bf
test: add validator, validator unit test for tree canopy gap
nlebovits Jun 25, 2025
df3006a
fix: remove obsolete DOR parcels dataset
nlebovits Jun 25, 2025
ff4cc52
test: add validator, validator unit tests for imm dang buildings
nlebovits Jun 25, 2025
398fc93
test: add validation, validator unit tests for park priority scores
nlebovits Jun 25, 2025
31add07
test: add validator, validator unit tests for kde base, gun crimes kde
nlebovits Jun 27, 2025
02cef5b
fix: place validators in the right folder
nlebovits Jun 27, 2025
aabc5ae
test: add drug crime kde validator, validator unit tests
nlebovits Jun 27, 2025
bcda9bd
feat: add pylint similarity checking to precommit hook
nlebovits Jun 27, 2025
325d28d
test: finish kde validators, add validator unit tests
nlebovits Jun 27, 2025
fbf5c66
test: add li violations validator, validator test (stats validator missing)
nlebovits Jun 27, 2025
bf73e7d
test: add validator, validator unit tests for access process
nlebovits Jun 27, 2025
81f7413
fix: rebase against staging, incorporate changes
nlebovits Jun 28, 2025
0ac5bae
test: add delinquencies validator, validator unit tests
nlebovits Jun 28, 2025
f303e2d
feat: update test_service.py to include all possible services
nlebovits Jun 28, 2025
0e3647b
test: add validator, validator unit tests for contig neighbors
nlebovits Jun 28, 2025
4ef9c90
style: ruff
nlebovits Jun 28, 2025
8884dbd
test: add conservatorship validator, validator unit tests
nlebovits Jun 28, 2025
8e9d014
test: add tactical urbanism validator, validator unit tests
nlebovits Jun 28, 2025
7d12661
test: add validator for priority level, add validator unit tests, cle…
nlebovits Jun 29, 2025
2a19b43
test: add dev probability validator, validator unit tests
nlebovits Jun 29, 2025
3ca4f09
style: upgrade ruff to match precommit hook
nlebovits Jun 29, 2025
7d3c98c
fix: add backup vacant land and buildings data in local file for easy…
nlebovits Jun 29, 2025
5501a83
test: finalize validation tests
nlebovits Jun 29, 2025
a881bcf
fix: resolve merge conflicts
nlebovits Jun 29, 2025
b43c23b
fix: remove obsolete files
nlebovits Jun 29, 2025
fbe7b07
feat: add documentation for backup data
nlebovits Jun 29, 2025
0e54a0f
fix: cleanup backend instructions
nlebovits Jun 29, 2025
46c9d85
fix: update backend docs, fix pyproject.toml
nlebovits Jun 29, 2025
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
backup_data/ filter=lfs diff=lfs merge=lfs -text
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,6 @@ data/src/app/service-account-key.json

# Cached and temporary data files from pipeline
storage/

# cursor
.cursor/
10 changes: 9 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,15 @@ repos:
echo "All MI grades are B or above."'
language: system
files: \.py$
pass_filenames: true
pass_filenames:
true
# Code similarity detection
- id: pylint-similarities
name: Code similarity detection (non-blocking)
entry: bash -c 'echo "Checking for code similarities in data/src/..."; pylint --disable=all --enable=similarities --score=no data/src/ || true'
language: system
files: ^data/src/.*\.py$
pass_filenames: false
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.3'
hooks:
Expand Down
10 changes: 10 additions & 0 deletions data/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Environment variables for Clean and Green Philly backend
# Copy this file to data/.env and fill in your actual values

# Google Cloud Platform credentials
CLEAN_GREEN_GOOGLE_KEY=your-api-key-here
GOOGLE_CLOUD_BUCKET_NAME=your-bucket-name-here
GOOGLE_CLOUD_PROJECT=your-project-id

# Slack integration for diff reporting
CAGP_SLACK_API_TOKEN=your-slack-token
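
A minimal sketch (not part of this diff) of how these variables might be consumed by the pipeline, assuming python-dotenv is used to load `data/.env`; the loading path and variable handling shown here are illustrative rather than the repository's actual startup code:

```python
# Illustrative only: how the pipeline might load the variables from
# data/.env.example. The use of python-dotenv and the exact path are
# assumptions, not the repository's actual startup code.
import os

from dotenv import load_dotenv

load_dotenv("data/.env")  # hypothetical location; copy .env.example here first

google_key = os.environ["CLEAN_GREEN_GOOGLE_KEY"]
bucket_name = os.environ["GOOGLE_CLOUD_BUCKET_NAME"]
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
slack_token = os.getenv("CAGP_SLACK_API_TOKEN")  # optional Slack integration
```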
15 changes: 15 additions & 0 deletions data/backup_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## Backup Data

This folder contains 1) backup vacant properties data from June 2024, the last time these data were reasonably accurate; and 2) the final outputs of our pipeline before project shutdown in July of 2025.

### Vacancy Data

The land backup file contains vacant land data sent to us by the Department of Licenses and Inspections, corresponding to the last reasonably complete dataset on vacant land before the City [stopped collecting vacancy data](https://www.inquirer.com/opinion/commentary/mayor-parker-housing-plan-missing-data-20250625.html).

The buildings backup contains data that we collected ourselves in June of 2024. It is likely missing a thousand or more buildings, since we had not yet realized that the buildings dataset was also corrupted, but it is the best data we have available under the circumstances.

Combined, these represent about 34,000 properties. The pipeline is configured to run using these backup data unless the City's APIs suddenly start returning data above the expected threshold again.

### Pipeline Outputs

As it's currently configured, the pipeline will return new data for everything _except_ the vacant properties themselves, for which it uses our June of 2024 backups. This means that all associated data are the currently available numbers from their corresponding services, but we have no way to update the vacant properties data themselves. We have stored these outputs here in both GeoParquet format (representing all 580,000+ properties in Philadelphia) and the PMTiles that we use to visualize vacant properties on the website (representing ~34,000 vacant properties from June of 2024, with the rest of the data from July of 2025).
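
The fallback described above (use live vacancy data only when the City's API returns a plausible number of rows, otherwise read the June 2024 backups) could be sketched as follows. The function name, threshold, and buildings backup filename are assumptions for illustration, not the pipeline's actual code:

```python
# Hypothetical sketch of the backup fallback described in the README above.
# load_vacant_properties, EXPECTED_MIN_ROWS, and the buildings backup
# filename are all assumed names for illustration only.
import geopandas as gpd
import pandas as pd

EXPECTED_MIN_ROWS = 20_000  # assumed "healthy API" threshold


def load_vacant_properties(fetch_live) -> gpd.GeoDataFrame:
    """Return live vacancy data if plausible, else the June 2024 backups."""
    try:
        live = fetch_live()  # caller-supplied function that hits the City API
        if len(live) >= EXPECTED_MIN_ROWS:
            return live
    except Exception:
        pass  # treat API failures like an implausibly small result

    land = gpd.read_parquet("data/backup_data/land_backup_2024_06_24.parquet")
    buildings = gpd.read_parquet(
        "data/backup_data/buildings_backup_2024_06_24.parquet"  # assumed name
    )
    combined = pd.concat([land, buildings], ignore_index=True)
    return gpd.GeoDataFrame(combined, crs=land.crs)
```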
Binary file not shown.
Binary file added data/backup_data/land_backup_2024_06_24.parquet
Binary file not shown.
1 change: 1 addition & 0 deletions data/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ services:
- CAGP_SLACK_API_TOKEN
volumes:
- ./src:/app/src
- ./backup_data:/app/backup_data
- ~/.config/gcloud/application_default_credentials.json:/app/service-account-key.json
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
Expand Down
4 changes: 3 additions & 1 deletion data/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ dependencies = [
"networkx~=3.4.2",
"pandas==2.2.2",
"pandera~=0.24.0",
"pre-commit>=4.2.0",
"pyarrow~=18.1.0",
"pydantic==2.8.2",
"pylint>=3.3.7",
"rasterio~=1.4.3",
"requests~=2.32.3",
"scikit-learn~=1.6.0",
Expand All @@ -38,7 +40,7 @@ dev = [
"pytest~=8.3.4",
"vulture~=2.14",
"radon~=6.0.1",
"ruff~=0.8.2",
"ruff~=0.12.0",
]

[tool.mypy]
Expand Down
66 changes: 40 additions & 26 deletions data/src/classes/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
from datetime import datetime
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import List

import geopandas as gpd
from tqdm import tqdm

from src.config.config import CACHE_FRACTION, ROOT_DIRECTORY
from src.config.config import CACHE_FRACTION, ROOT_DIRECTORY, get_logger

print(f"Root directory is {ROOT_DIRECTORY}")

Expand Down Expand Up @@ -120,10 +121,9 @@ def check_source_cache_file_exists(
table_name (str): The name of the table of source data.
load_type (LoadType): The destination type of the file (either SOURCE_CACHE or PIPELINE_CACHE).
"""
cache_logger = get_logger("cache")
start_time = time.time()
print(
f" FileManager.check_source_cache_file_exists: Checking for {table_name}"
)
cache_logger.info(f"Checking for {table_name}")

directory = (
self.source_cache_directory
Expand All @@ -140,8 +140,8 @@ def check_source_cache_file_exists(
result = len(files) > 0
total_time = time.time() - start_time

print(
f" FileManager.check_source_cache_file_exists: Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
cache_logger.info(
f"Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
)
return result

Expand All @@ -154,10 +154,9 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
GeoDataFrame: The dataframe loaded from the most recent cached file.
None: If no files exist for the given table name.
"""
cache_logger = get_logger("cache")
start_time = time.time()
print(
f" FileManager.get_most_recent_cache: Loading most recent cache for {table_name}"
)
cache_logger.info(f"Loading most recent cache for {table_name}")

# Use glob pattern matching for more efficient file searching
pattern = os.path.join(self.source_cache_directory, f"*{table_name}*.parquet")
Expand All @@ -167,19 +166,19 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
glob_time = time.time() - glob_start

if not cached_files:
print(" FileManager.get_most_recent_cache: No cached files found")
cache_logger.info("No cached files found")
return None

# Get the most recent file by modification time
mtime_start = time.time()
most_recent_file = max(cached_files, key=os.path.getmtime)
mtime_time = time.time() - mtime_start

print(
f" FileManager.get_most_recent_cache: Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
cache_logger.info(
f"Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
)
print(
f" FileManager.get_most_recent_cache: Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
cache_logger.info(
f"Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
)

# Load the parquet file
Expand All @@ -188,8 +187,8 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
load_time = time.time() - load_start

total_time = time.time() - start_time
print(
f" FileManager.get_most_recent_cache: Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
cache_logger.info(
f"Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
)

return gdf
Expand All @@ -205,7 +204,10 @@ def load_gdf(
file_type (FileType): The type of the file (GEOJSON or PARQUET).
load_type (LoadType): The destination type of the file (TEMP or CACHE).
"""
cache_logger = get_logger("cache")
file_path = self.get_file_path(file_name, load_type, file_type)
cache_logger.info(f"Loading {file_name} from {file_path}")

if os.path.exists(file_path):
gdf = (
gpd.read_parquet(file_path)
Expand Down Expand Up @@ -234,37 +236,39 @@ def save_gdf(
file_type (FileType): The type of the file (GEOJSON or PARQUET).
load_type (LoadType): The destination type of the file (TEMP or CACHE).
"""
cache_logger = get_logger("cache")
cache_logger.info(f"Saving {file_name} to {load_type.value}/{file_type.value}")
start_time = time.time()
print(f" FileManager.save_gdf: Starting save for {file_name}")
cache_logger.info(f"Starting save for {file_name}")

file_path = self.get_file_path(file_name, load_type, file_type)
print(f" FileManager.save_gdf: Target path: {file_path}")
cache_logger.info(f"Target path: {file_path}")

if file_type == FileType.PARQUET:
print(
f" FileManager.save_gdf: Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
cache_logger.info(
f"Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
)
parquet_start = time.time()
gdf.to_parquet(file_path, index=False)
parquet_time = time.time() - parquet_start
print(f" FileManager.save_gdf: Parquet write took {parquet_time:.2f}s")
cache_logger.info(f"Parquet write took {parquet_time:.2f}s")
elif file_type == FileType.GEOJSON:
print(" FileManager.save_gdf: Writing GeoJSON file")
cache_logger.info("Writing GeoJSON file")
geojson_start = time.time()
gdf.to_file(file_path, driver="GeoJSON")
geojson_time = time.time() - geojson_start
print(f" FileManager.save_gdf: GeoJSON write took {geojson_time:.2f}s")
cache_logger.info(f"GeoJSON write took {geojson_time:.2f}s")
elif file_type == FileType.CSV:
print(" FileManager.save_gdf: Writing CSV file")
cache_logger.info("Writing CSV file")
csv_start = time.time()
gdf.to_csv(file_path)
csv_time = time.time() - csv_start
print(f" FileManager.save_gdf: CSV write took {csv_time:.2f}s")
cache_logger.info(f"CSV write took {csv_time:.2f}s")
else:
raise ValueError(f"Unsupported file type: {file_type}")

total_time = time.time() - start_time
print(f" FileManager.save_gdf: Total save operation took {total_time:.2f}s")
cache_logger.info(f"Total save operation took {total_time:.2f}s")

def save_fractional_gdf(
self,
Expand Down Expand Up @@ -315,3 +319,13 @@ def extract_all(self, buffer: BytesIO) -> None:
destination = self.temp_directory
with zipfile.ZipFile(buffer) as zip_ref:
zip_ref.extractall(destination)

def get_cache_file_path(self, cache_key: str) -> Path:
"""
Get the path for a cache file.
"""
cache_logger = get_logger("cache")
cache_dir = ROOT_DIRECTORY / "storage" / "cache"
cache_file = cache_dir / f"{cache_key}.parquet"
cache_logger.info(f"Cache file path: {cache_file}")
return cache_file
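
The refactor above routes all of `FileManager`'s progress output through `get_logger("cache")`, imported from `src.config.config`, whose definition is not shown in this diff. A minimal sketch consistent with how it is called might look like this; the logger name prefix, format string, and level are assumptions:

```python
# Sketch only: get_logger is imported from src.config.config but not shown
# in this diff. A minimal implementation consistent with its call sites
# (get_logger("cache").info(...)) might look like this.
import logging


def get_logger(name: str) -> logging.Logger:
    """Return a namespaced logger, configured once per process."""
    logger = logging.getLogger(f"pipeline.{name}")
    if not logger.handlers:  # avoid attaching duplicate handlers
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger
```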