19 changes: 0 additions & 19 deletions .github/workflows/pr_checks_backend.yml
@@ -6,8 +6,6 @@ on:
      - staging
    paths:
      - 'data/**'
      - 'Dockerfile-pg'
      - 'init_pg.sql'
      - 'docker compose.yml'
  workflow_dispatch:

@@ -35,23 +33,6 @@ jobs:
    defaults:
      run:
        working-directory: data/src
    env:
      VACANT_LOTS_DB: 'postgresql://postgres:temp-CI-only@localhost:5433/vacantlotdb'
    services:
      postgres:
        image: postgis/postgis:16-3.4
        env:
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: temp-CI-only # CI-only, safe to hardcode for temporary container
          POSTGRES_DB: vacantlotdb
        ports:
          - 5433:5432
        # Set health checks to wait until postgres is ready
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
8 changes: 0 additions & 8 deletions data/Dockerfile
@@ -15,13 +15,6 @@ RUN apt-get update && apt-get install -y \
lsb-release \
&& rm -rf /var/lib/apt/lists/*

# install postgres client 16 for psql and pg_dump executables for backups.
# should match the version used in the other docker file for the postgres install
RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
RUN curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg
RUN apt update
RUN apt install -y postgresql-client-16

# Set GDAL environment variables
ENV GDAL_VERSION=3.6.2
ENV GDAL_CONFIG=/usr/bin/gdal-config
@@ -52,4 +45,3 @@ COPY src ./src
RUN ls -a /usr/src/app

CMD ["pipenv", "run", "python", "-m", "src.main.py"]

3 changes: 0 additions & 3 deletions data/Pipfile
@@ -8,7 +8,6 @@ awkde = { git = 'https://github.com/mennthor/awkde.git', ref = '5b601fe4d92229d5
esridump = "~=1.13.0"
fiona = "~=1.10.1"
future = "~=1.0.0"
geoalchemy2 = "~=0.16.0"
geopandas = "==1.0.1"
google-cloud-storage = "~=2.19.0"
jenkspy = "~=0.4.1"
@@ -19,15 +18,13 @@ mapclassify = "~=2.8.1"
matplotlib = "~=3.9.3"
networkx = "~=3.4.2"
pandas = "==2.2.2"
psycopg2-binary = "~=2.9.10"
pyarrow = "~=18.1.0"
pydantic = "==2.8.2"
rasterio = "~=1.4.3"
requests = "~=2.32.3"
scikit-learn = "~=1.6.0"
shapely = "~=2.0.6"
slack-sdk = "~=3.33.5"
sqlalchemy = "~=2.0.36"
tqdm = "~=4.67.1"


1,186 changes: 511 additions & 675 deletions data/Pipfile.lock

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions data/docker-compose.yml
@@ -7,8 +7,6 @@ services:
    environment:
      - GOOGLE_APPLICATION_CREDENTIALS=/app/service-account-key.json
      - VACANT_LOTS_DB
      - POSTGRES_PASSWORD
      - POSTGRES_PORT=5434
      - CLEAN_GREEN_GOOGLE_KEY
      - PYTHONUNBUFFERED=1
      - GOOGLE_CLOUD_BUCKET_NAME
3 changes: 1 addition & 2 deletions data/src/classes/data_diff.py
@@ -1,6 +1,6 @@
from datetime import datetime
import os
import re
from datetime import datetime

from src.classes.file_manager import FileManager, FileType, LoadType

@@ -13,7 +13,6 @@ def __init__(self, table_name="all_properties_end", unique_id_col="opa_id"):
        Initialize the DiffReport.

        Args:
            conn: SQLAlchemy connection to the database.
            table_name (str): The name of the table to analyze.
            unique_id_col (str): Column used as a unique identifier.
        """
4 changes: 0 additions & 4 deletions data/src/classes/slack_reporters.py
@@ -86,10 +86,6 @@ def send_parquet_stats_to_slack(self, table_names: List[str]):
"""
Report total sizes for all hypertables using hypertable_detailed_size
and send the result to a Slack channel.

Args:
conn: SQLAlchemy connection to the PostgreSQL database.
slack_token (str): The Slack API token. If not provided, it will be read from the environment.
"""

detailed_sizes = []
13 changes: 4 additions & 9 deletions data/src/config/config.py
@@ -2,10 +2,8 @@
from pathlib import Path

FORCE_RELOAD = True
""" During the data load, whether to query the various GIS API services for the data to load into the postgres tables. If True, will query the API services, backup the database, reload the database and report on data differences. If false will read the data from postgres."""

BACKUP_SCHEMA = False
""" Whether to backup the database schema before loading the data in script.py. """
""" During the data load, whether to query the various GIS API services for the data to load. If True, will query the
API services and report on data differences. If false will read the cached data."""

USE_CRS = "EPSG:2272"
""" the standard geospatial code for Pennsylvania South (ftUS) """
@@ -19,9 +17,6 @@
log_level: int = logging.WARN
""" overall log level for the project """

max_backup_schema_days: int = 365
""" max days to keep backed up schemas archived in psql """

report_to_slack_channel: str = ""
""" if this is not blank, send the data-diff summary report to this Slack channel.
The CAGP_SLACK_API_TOKEN environment variable must be set """
@@ -39,14 +34,14 @@
""" the prefix of the name of the tiles file generated and saved to GCP """

write_production_tiles_file: bool = False
""" Whether to write the main vacant_properties_tiles.pmtiles as well as the staging vacant_properties_tiles_staging.pmtiles.
""" Whether to write the main vacant_properties_tiles.pmtiles as well as the staging vacant_properties_tiles_staging.pmtiles.
BE CAREFUL, if true this writes the production file.
"""
tile_file_backup_directory: str = "backup"
""" The name of the directory in GCP to store timestamped backups of the tiles file """

min_tiles_file_size_in_bytes: int = 5 * 1024 * 1024
""" The minimum file size in bytes of the final generated pm tiles file. If the file is not at least this size,
""" The minimum file size in bytes of the final generated pm tiles file. If the file is not at least this size,
don't upload to the GCP bucket as the file may be corrupted, e.g. a source vacant properties dataset was incomplete with not enough features."""
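The guard this docstring describes amounts to a simple size check before upload — a hypothetical sketch, with the file name assumed and the threshold taken from the config value above:

```python
import os

# Hypothetical guard mirroring the config docstring: skip the GCP upload
# when the generated tiles file looks truncated or corrupted.
min_tiles_file_size_in_bytes = 5 * 1024 * 1024

if os.path.getsize("tmp/vacant_properties_tiles.pmtiles") < min_tiles_file_size_in_bytes:
    raise SystemExit("tiles file too small; skipping upload")
```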


13 changes: 0 additions & 13 deletions data/src/config/psql.py

This file was deleted.

13 changes: 0 additions & 13 deletions data/src/data_utils/utils.py
@@ -1,21 +1,8 @@
import os
import re

import requests


def mask_password(value: str):
    """remove the password from this postgresql connect string so we don't write it to logs, etc.

    Args:
        value (str): the unmasked string containing one or more postgres connect string.

    Returns:
        _type_: the string with the password replaced by MASKED
    """
    return re.sub(":\w+@", ":MASKED@", value)


def save_stream_url(url: str) -> str:
    """download the file from this url to the tmp/ directory by streaming in a memory-friendly way.
    If local file already exists, use it and don't download.
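The retained `save_stream_url` helper is only partially shown above. A self-contained sketch of the behavior its docstring describes — a streamed download with a local-file cache — assuming files land in `tmp/` named after the URL basename (the repository's actual implementation may differ):

```python
import os

import requests


def save_stream_url(url: str) -> str:
    """Download url into tmp/ by streaming; reuse a local copy if present."""
    local_path = os.path.join("tmp", os.path.basename(url))
    if os.path.exists(local_path):
        return local_path
    os.makedirs("tmp", exist_ok=True)
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(local_path, "wb") as f:
            # 1 MiB chunks keep memory use flat for large downloads.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    return local_path
```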
77 changes: 6 additions & 71 deletions docs/SETUP/BACK_END.md
@@ -2,7 +2,7 @@

## Overview

If you plan to contribute to the data wrangling and database management on this project and need to run the Python script, follow the installation and setup instructions below.
If you plan to contribute to the data wrangling on this project and need to run the Python script, follow the installation and setup instructions below.

## Setup

@@ -54,10 +54,7 @@ The project requires specific and sensitive information to run, which should be
1. Create a file named `.env` in the `/data` subdirectory of your project.
2. Add the following environment variables to the `.env` file:

```sh
POSTGRES_PASSWORD=a-strong-password-here
VACANT_LOTS_DB=postgresql://postgres:${POSTGRES_PASSWORD}@localhost:5433/vacantlotdb
```
<!-- TODO: What env vars do we need? -->

All local environment variables will be passed through to docker compose, so if you have them set up in the `.env` file, you should not need to hard-code them elsewhere.

@@ -67,39 +64,23 @@ For Mac and Linux, you can permanently store the environment variables in your c

For Windows, you can set environment variables under System -> Advanced or you can download a terminal emulator such as [Git Bash](https://gitforwindows.org/) and follow the instructions for Mac and Linux above. A terminal emulator is recommended.

```sh
export POSTGRES_PASSWORD=a-strong-password-here
export VACANT_LOTS_DB=postgresql://postgres:${POSTGRES_PASSWORD}@localhost:5433/vacantlotdb
```

All of your local environment variables will be passed through to docker compose, so if you have them locally, you should not have to hard-code them.

### Docker Build

Docker is a platform that allows you to containerize and run applications in isolated environments, making it easier to manage dependencies and ensure consistent deployments. Download the [latest version of Docker Desktop for your operating system](https://www.docker.com/products/docker-desktop/).

We use [docker compose](https://docs.docker.com/compose/) to manage the backend Docker services. The `data/docker-compose.yaml` file defines the services. The only service that runs perpetually in Docker is `postgres`. The other services are one-time batch jobs to build the data sets.
We use [docker compose](https://docs.docker.com/compose/) to manage the backend Docker services. The `data/docker-compose.yaml` file defines the services.

1. The first time you set up your backend, or any time either of the two Docker files change, build the Docker services by running:
1. The first time you set up your backend, or any time the Dockerfile changes, build the Docker services by running:

```sh
docker compose build
```

This should correctly build both containers. However, if it does not, you can explicitly build the postgres container with the following:

```sh
docker compose build postgres
```

2. When both containers are built, connect to the PG database in the container by running:
```sh
docker compose up -d postgres
```

For first-time runs, set `FORCE_RELOAD=True` in `config.py` and optionally `log_level: int = logging.DEBUG` to get more verbose output.

All Docker commands should be run from the `data/` directory. There is one main `Dockerfile` for the batch scripts and one called `Dockerfile-pg` for the PostgreSQL and postgis installation. There is also a file called `init_pg.sql` that is run one time by Docker when the postgres data volume is empty to create the database and install postgis. You should not have to touch any of the above three files.
All Docker commands should be run from the `data/` directory.

#### Windows

@@ -121,55 +102,13 @@ The backend also works on WSL Ubuntu running Docker for Linux on Windows 10.

In the terminal, use the `cd` command to navigate to your repository location, and then into the `data` directory. Run `docker compose run vacant-lots-proj`. This command starts Docker Compose and sets up your environment as defined in your `docker-compose.yml` file. When you're finished and want to shut down the Docker containers, run `docker compose down`.

### PostgreSQL

[PostgreSQL](https://www.postgresql.org/) AKA postgres, pg, psql is an open-source relational database management system. It is used in this project only by the data load script to stage data and by the data diff process to compare new data with backed up data. It is not needed by the front-end to run. We run Postgres with the [Postgis](https://postgis.net/) extension for geospatial data in a Docker container.

We are running postgres on the non-standard port 5433 instead of the default of 5432. This is so our Docker postgres will not conflict with any native postgres already running on the developer's PC.

To start the postgres Docker container, run:

```sh
docker compose up -d postgres
```

You can access the psql command line in your container to work with the database with this command:

```sh
docker exec -it cagp-postgres psql -U postgres -d vacantlotdb
```

To stop the postgres container run:

```sh
docker compose down postgres
```

### PostgreSQL Extensions

We use Postgres extensions for GIS and time series functionality not included in base Postgres.

#### PostGIS

[PostGIS](https://postgis.net/) is an open-source extension for PostgreSQL that adds support for spatial and geographic data types and functions. It enables the storage, querying, and analysis of location-based data directly within the database, replacing the need for many external tools and libraries.

#### Timescale DB

[TimescaleDB](https://docs.timescale.com/) is an open-source relational database built on PostgreSQL, optimized for handling time-series data efficiently.

At the core of TimescaleDB are hypertables, which partition data across time for efficient querying. Hypertables behave like normal Postgres tables, but are optimized for querying data based on timestamps. For our use case, hypertables simplify data management by automatically creating monthly partitions, replacing our previous method of manually creating a separate schema for each month.

#### pg_stat_statements

The [pg_stat_statements](https://www.postgresql.org/docs/current/pgstatstatements.html) extension provides detailed statistics on query performance, helping to identify slow or resource-intensive queries. It tracks execution counts, execution times, and rows returned, making it a useful tool for analyzing slow or problematic queries.

## Python Development

You can set up your local Python environment so you can develop and run the backend `script.py` and create and run unit tests outside of Docker. Build your local environment to match what is defined in the `Dockerfile`. Install the same Python version as in the `Dockerfile`, using `pyenv` to manage multiple distributions if needed. Use `pipenv` to create a virtual environment. Install the pip dependencies that are defined in the `Pipfile` into your virtual environment. Install the executables with `apt-get`. Now you can develop in Python in your terminal and IDE and run unit tests with `pytest`.

## Configuration

There are numerous configuration variables in `data/src/config/config.py`. See the documentation in that file for each variable. You will also have to set up environmental variables for keys and database connection parameters as defined throughout this document.
There are numerous configuration variables in `data/src/config/config.py`. See the documentation in that file for each variable. You will also have to set up environmental variables as defined throughout this document.

There are the following secrets that may be securely shared with you by the project leads:

@@ -201,8 +140,6 @@ You can run the tile build locally with `docker compose run vacant-lots-proj` to
Your `/data/.env` file should now look like this:

```sh
POSTGRES_PASSWORD=a-strong-password-here
VACANT_LOTS_DB=postgresql://postgres:${POSTGRES_PASSWORD}@localhost:5433/vacantlotdb
CLEAN_GREEN_GOOGLE_KEY=your-api-key-here
GOOGLE_CLOUD_BUCKET_NAME=your-bucket-name-here
```
Expand All @@ -227,8 +164,6 @@ The script should only load new images that aren't in the bucket already (new pr

Whenever the data load script is run in force reload mode, the old data set is backed up and a report of any differences is sent to the team via Slack. Differences in data are calculated using the [data-diff](https://github.com/datafold/data-diff) package. See [issue 520](https://github.com/CodeForPhilly/clean-and-green-philly/issues/520) in Github.

Backups are done in PostgreSQL in the vacantlotsdb database by copying the whole public schema to a backup schema named backup\_{timestamp}. Besides the original tables, the backup schema includes a '{table_name}\_diff' table with details of the differences from data-diff for each table.

Backup schemas are only kept for one year by default. Backup schemas older than a year are deleted at the end of the load script.

After all runs of the back-end script, the tiles file is backed up to the backup/ directory in the GCP bucket with a timestamp. If the main tiles file ever gets corrupted, it can be rolled back to a backup file.
2 changes: 1 addition & 1 deletion docs/TECHNOLOGIES.md
@@ -59,7 +59,7 @@ We use a variety of file types for storage of the end dataset constructed by the

The first are PMtiles, which is a single-file vector tile format for geospatial data. Its main benefit is that, as a single file, it encompasses all vector tiles at each zoom level for our data, and so it can be hosted in our Google Cloud Bucket and simply fetched by Maplibre rather than needing a more complex backend or querying solution as an intermediary. More information can be found [here](https://docs.protomaps.com/pmtiles/).

The second are GeoParquet files, which are a geospatial addition to Apache Parquet files. Their benefit is they are a highly performant way to compress and store columnar data, and they have useful mechanisms for self-describing and partitioning the data within the file itself for faster retrieval and querying in the future. We are in the process of shifting the majority of our storage into this format and phasing out the Postgres-based service we have previously been using, which is mentioned below. You can find out more [here](https://geoparquet.org/releases/v0.2.0/).
The second are GeoParquet files, which are a geospatial addition to Apache Parquet files. Their benefit is that they are a highly performant way to compress and store columnar data, and they have useful mechanisms for self-describing and partitioning the data within the file itself for faster retrieval and querying in the future. You can find out more [here](https://geoparquet.org/releases/v0.2.0/).
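As a concrete illustration of that self-describing property, GeoPandas reads these files directly, recovering geometry columns and CRS from the file's own metadata — a minimal sketch with a hypothetical file name:

```python
import geopandas as gpd

# Geometry and CRS are recovered from the GeoParquet metadata itself;
# the path below is illustrative, not a file shipped with the repo.
gdf = gpd.read_parquet("tmp/vacant_properties.parquet")
print(gdf.crs, len(gdf))
```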

### Python version and dependency management - Pipenv - Pyenv
