diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 7e6f8384..a15ae9fc 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -21,6 +21,10 @@ jobs: with: args: ". -l 79 --check" Test: + permissions: + contents: "read" + # Required to auth against gcp + id-token: "write" runs-on: larger-runner steps: - name: Checkout repo @@ -32,6 +36,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: '3.11' + - uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" + service_account: "policyengine-research@policyengine-research.iam.gserviceaccount.com" - name: Install package run: uv pip install -e .[dev] --system diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..32f49a87 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Upload to GCP on dataset build. diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index 89efcaa6..72c59a3d 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -5,10 +5,20 @@ ) from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.utils.huggingface import upload +from google.cloud import storage def upload_datasets(): - for dataset in [EnhancedCPS_2024, Pooled_3_Year_CPS_2023, CPS_2023]: + storage_client = storage.Client() + bucket = storage_client.bucket("policyengine-us-data") + + datasets_to_upload = [ + EnhancedCPS_2024, + Pooled_3_Year_CPS_2023, + CPS_2023, + ] + + for dataset in datasets_to_upload: dataset = dataset() if not dataset.exists: raise ValueError( @@ -21,6 +31,13 @@ def upload_datasets(): dataset.file_path.name, ) + blob = dataset.file_path.name + blob = bucket.blob(blob) + blob.upload_from_filename(dataset.file_path) + print( + f"Uploaded {dataset.file_path.name} to GCS bucket policyengine-us-data." + ) + if __name__ == "__main__": upload_datasets() diff --git a/pyproject.toml b/pyproject.toml index 3ae3b244..3a41f362 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "microdf_python>=0.4.3", "microimpute", "pip-system-certs", + "google-cloud-storage", ] [project.optional-dependencies]