Skip to content

Commit bbcf802

Browse files
Merge pull request #112 from PolicyEngine/nikhilwoodruff/issue111
Add GCP dataset downloads
2 parents e67f4cb + c968bf9 commit bbcf802

File tree

8 files changed

+149
-52
lines changed

8 files changed

+149
-52
lines changed

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- bump: minor
2+
changes:
3+
added:
4+
- Google Cloud Storage data downloads.

policyengine/outputs/macro/comparison/calculate_economy_comparison.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from microdf import MicroSeries
44
import numpy as np
5-
from policyengine_core.tools.hugging_face import download_huggingface_dataset
5+
from policyengine.utils.data_download import download
66
import pandas as pd
77
import h5py
88
from pydantic import BaseModel
@@ -709,18 +709,20 @@ def uk_constituency_breakdown(
709709
baseline_hnet = baseline.household_net_income
710710
reform_hnet = reform.household_net_income
711711

712-
constituency_weights_path = download_huggingface_dataset(
713-
repo="policyengine/policyengine-uk-data",
714-
repo_filename="parliamentary_constituency_weights.h5",
712+
constituency_weights_path = download(
713+
huggingface_repo="policyengine-uk-data",
714+
gcs_bucket="policyengine-uk-data-private",
715+
filepath="parliamentary_constituency_weights.h5",
715716
)
716717
with h5py.File(constituency_weights_path, "r") as f:
717718
weights = f["2025"][
718719
...
719720
] # {2025: array(650, 100180) where cell i, j is the weight of household record i in constituency j}
720721

721-
constituency_names_path = download_huggingface_dataset(
722-
repo="policyengine/policyengine-uk-data",
723-
repo_filename="constituencies_2024.csv",
722+
constituency_names_path = download(
723+
huggingface_repo="policyengine-uk-data",
724+
gcs_bucket="policyengine-uk-data-private",
725+
filepath="constituencies_2024.csv",
724726
)
725727
constituency_names = pd.read_csv(
726728
constituency_names_path

policyengine/simulation.py

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""Simulate tax-benefit policy and derive society-level output statistics."""
2-
2+
33
from pydantic import BaseModel, Field
44
from typing import Literal
55
from .constants import DEFAULT_DATASETS_BY_COUNTRY
@@ -10,7 +10,6 @@
1010
from .utils.reforms import ParametricReform
1111
from policyengine_core.reforms import Reform as StructuralReform
1212
from policyengine_core.data import Dataset
13-
from .utils.huggingface import download
1413
from policyengine_us import (
1514
Simulation as USSimulation,
1615
Microsimulation as USMicrosimulation,
@@ -26,6 +25,7 @@
2625
from functools import wraps, partial
2726
from typing import Dict, Any, Callable
2827
import importlib
28+
from policyengine.utils.data_download import download
2929

3030
CountryType = Literal["uk", "us"]
3131
ScopeType = Literal["household", "macro"]
@@ -78,6 +78,7 @@ def __init__(self, **options: SimulationOptions):
7878
self.options.country
7979
]
8080

81+
self._set_data()
8182
self._initialise_simulations()
8283
self._add_output_functions()
8384

@@ -118,7 +119,36 @@ def _set_data(self):
118119
self.options.country
119120
]
120121

121-
self._data_handle_cps_special_case()
122+
if isinstance(self.options.data, str):
123+
filename = self.options.data
124+
if "://" in self.options.data:
125+
bucket = None
126+
hf_repo = None
127+
hf_org = None
128+
if "gs://" in self.options.data:
129+
bucket, filename = self.options.data.split("://")[
130+
-1
131+
].split("/")
132+
elif "hf://" in self.options.data:
133+
hf_org, hf_repo, filename = self.options.data.split("://")[
134+
-1
135+
].split("/", 2)
136+
137+
if not Path(filename).exists():
138+
file_path = download(
139+
filepath=filename,
140+
huggingface_org=hf_org,
141+
huggingface_repo=hf_repo,
142+
gcs_bucket=bucket,
143+
)
144+
filename = str(Path(file_path))
145+
if "cps_2023" in filename:
146+
time_period = 2023
147+
else:
148+
time_period = None
149+
self.options.data = Dataset.from_file(
150+
filename, time_period=time_period
151+
)
122152

123153
def _initialise_simulations(self):
124154
self.baseline_simulation = self._initialise_simulation(
@@ -228,10 +258,9 @@ def _apply_region_to_simulation(
228258
elif "constituency/" in region:
229259
constituency = region.split("/")[1]
230260
constituency_names_file_path = download(
231-
repo="policyengine/policyengine-uk-data",
232-
repo_filename="constituencies_2024.csv",
233-
local_folder=None,
234-
version=None,
261+
huggingface_repo="policyengine-uk-data",
262+
gcs_bucket="policyengine-uk-data-private",
263+
filepath="constituencies_2024.csv",
235264
)
236265
constituency_names_file_path = Path(
237266
constituency_names_file_path
@@ -250,10 +279,9 @@ def _apply_region_to_simulation(
250279
f"Constituency {constituency} not found. See {constituency_names_file_path} for the list of available constituencies."
251280
)
252281
weights_file_path = download(
253-
repo="policyengine/policyengine-uk-data",
254-
repo_filename="parliamentary_constituency_weights.h5",
255-
local_folder=None,
256-
version=None,
282+
huggingface_repo="policyengine-uk-data",
283+
gcs_bucket="policyengine-uk-data-private",
284+
filepath="parliamentary_constituency_weights.h5",
257285
)
258286

259287
with h5py.File(weights_file_path, "r") as f:
@@ -267,10 +295,9 @@ def _apply_region_to_simulation(
267295
elif "local_authority/" in region:
268296
la = region.split("/")[1]
269297
la_names_file_path = download(
270-
repo="policyengine/policyengine-uk-data",
271-
repo_filename="local_authorities_2021.csv",
272-
local_folder=None,
273-
version=None,
298+
huggingface_repo="policyengine-uk-data",
299+
gcs_bucket="policyengine-uk-data-private",
300+
filepath="local_authorities_2021.csv",
274301
)
275302
la_names_file_path = Path(la_names_file_path)
276303
la_names = pd.read_csv(la_names_file_path)
@@ -283,10 +310,9 @@ def _apply_region_to_simulation(
283310
f"Local authority {la} not found. See {la_names_file_path} for the list of available local authorities."
284311
)
285312
weights_file_path = download(
286-
repo="policyengine/policyengine-uk-data",
287-
repo_filename="local_authority_weights.h5",
288-
local_folder=None,
289-
version=None,
313+
huggingface_repo="policyengine-uk-data",
314+
gcs_bucket="policyengine-uk-data-private",
315+
filepath="local_authority_weights.h5",
290316
)
291317

292318
with h5py.File(weights_file_path, "r") as f:
@@ -299,21 +325,3 @@ def _apply_region_to_simulation(
299325
)
300326

301327
return simulation
302-
303-
def _data_handle_cps_special_case(self):
304-
"""Handle special case for CPS data- this data doesn't specify time periods for each variable, but we still use it intensively."""
305-
if self.data is not None and "cps_2023" in self.data:
306-
if "hf://" in self.data:
307-
owner, repo, filename = self.data.split("/")[-3:]
308-
if "@" in filename:
309-
version = filename.split("@")[-1]
310-
filename = filename.split("@")[0]
311-
else:
312-
version = None
313-
self.data = download(
314-
repo=owner + "/" + repo,
315-
repo_filename=filename,
316-
local_folder=None,
317-
version=version,
318-
)
319-
self.data = Dataset.from_file(self.data, "2023")
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
from pathlib import Path
import logging

from pydantic import BaseModel

from policyengine.utils.huggingface import download_from_hf
from policyengine.utils.google_cloud_bucket import download_file_from_gcs

# Module-level logger; never monkey-patch logging.info (the original
# `logging.info = print` rebinding leaks into every other module).
logger = logging.getLogger(__name__)


class DataFile(BaseModel):
    """Validated description of a dataset file and its download sources."""

    # Path of the file in the remote store; also used as the local path.
    filepath: str
    # Hugging Face organisation (owner) of the hosting repo.
    huggingface_org: str
    # Hugging Face repo name; None disables the Hugging Face source.
    huggingface_repo: str | None = None
    # Google Cloud Storage bucket name; None disables the GCS source.
    gcs_bucket: str | None = None


def download(
    filepath: str,
    huggingface_repo: str | None = None,
    gcs_bucket: str | None = None,
    huggingface_org: str = "policyengine",
) -> str:
    """Download a data file, trying Hugging Face first and then GCS.

    Args:
        filepath: Path of the file in the remote store; also used as the
            local destination path.
        huggingface_repo: Hugging Face repo name (without the org prefix).
        gcs_bucket: Google Cloud Storage bucket name.
        huggingface_org: Hugging Face organisation owning the repo.

    Returns:
        The local path of the downloaded (or already-present) file.

    Raises:
        ValueError: If no download source is configured, or the Hugging
            Face download failed and no GCS bucket was provided.
    """
    data_file = DataFile(
        filepath=filepath,
        huggingface_org=huggingface_org,
        huggingface_repo=huggingface_repo,
        gcs_bucket=gcs_bucket,
    )

    # Skip the download entirely if the file already exists locally.
    if Path(filepath).exists():
        logger.info("File %s already exists. Skipping download.", filepath)
        return filepath

    if data_file.huggingface_repo is not None:
        logger.info("Using Hugging Face for download.")
        try:
            return download_from_hf(
                repo=f"{data_file.huggingface_org}/{data_file.huggingface_repo}",
                repo_filename=data_file.filepath,
            )
        except Exception:
            # Deliberate best-effort: fall through to GCS (if configured)
            # rather than failing hard. Narrowed from a bare `except:` so
            # KeyboardInterrupt/SystemExit still propagate.
            logger.exception("Failed to download from Hugging Face.")

    if data_file.gcs_bucket is not None:
        logger.info("Using Google Cloud Storage for download.")
        download_file_from_gcs(
            bucket_name=data_file.gcs_bucket,
            file_name=filepath,
            destination_path=filepath,
        )
        return filepath

    raise ValueError(
        "No valid download method specified. Please provide either a Hugging Face repo or a Google Cloud Storage bucket."
    )
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
def download_file_from_gcs(
    bucket_name: str, file_name: str, destination_path: str
) -> str:
    """
    Download a file from Google Cloud Storage to a local path.

    Args:
        bucket_name (str): The name of the GCS bucket.
        file_name (str): The name of the file in the GCS bucket.
        destination_path (str): The local path where the file will be saved.

    Returns:
        str: The local path the file was written to (``destination_path``).
    """
    # Imported lazily so importing this module does not require
    # google-cloud-storage unless a GCS download is actually performed.
    from google.cloud import storage

    # Initialize a client (uses application-default credentials from the
    # environment — TODO confirm against deployment configuration).
    client = storage.Client()

    # Get the bucket
    bucket = client.bucket(bucket_name)

    # Create a blob object from the file name
    blob = bucket.blob(file_name)

    # Download the file to a local path
    blob.download_to_filename(destination_path)

    # Fixed: the original annotated `-> None` (and documented "Returns:
    # None") while actually returning the path; callers rely on the path.
    return destination_path

policyengine/utils/huggingface.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import time
55

66

7-
def download(
7+
def download_from_hf(
88
repo: str,
99
repo_filename: str,
1010
local_folder: str | None = None,

policyengine/utils/maps.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pandas as pd
22
import plotly.express as px
33
import pandas as pd
4-
from policyengine.utils.huggingface import download
4+
from policyengine.utils.data_download import download
55
import plotly.express as px
66
from policyengine.utils.charts import *
77

@@ -10,16 +10,12 @@ def get_location_options_table(location_type: str) -> pd.DataFrame:
1010
if location_type == "parliamentary_constituencies":
1111
area_names_file_path = download(
1212
repo="policyengine/policyengine-uk-data",
13-
repo_filename="constituencies_2024.csv",
14-
local_folder=None,
15-
version=None,
13+
filepath="constituencies_2024.csv",
1614
)
1715
elif location_type == "local_authorities":
1816
area_names_file_path = download(
1917
repo="policyengine/policyengine-uk-data",
20-
repo_filename="local_authorities_2021.csv",
21-
local_folder=None,
22-
version=None,
18+
filepath="local_authorities_2021.csv",
2319
)
2420
df = pd.read_csv(area_names_file_path)
2521
return df

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ dependencies = [
1919
"microdf_python",
2020
"getpass4",
2121
"pydantic",
22+
"google-cloud-storage",
2223
]
2324

2425
[project.optional-dependencies]

0 commit comments

Comments
 (0)