Store latest datasets cronjob #1032
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow that downloads the datasets listed in the URLs matrix, uploads them
# to the `mdb-latest` storage bucket, and then re-downloads the stored copies
# to confirm they are readable zip archives.
name: Store latest datasets cronjob

on:
  # Allows the workflow to be started manually.
  workflow_dispatch:
  schedule:
    # Mondays and Thursdays at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: "0 0 * * 1,4"

env:
  # Prefix prepended to each dataset's `latest` value when the stored copy is
  # downloaded again for validation.
  URL_STORAGE_PREFIX: "https://storage.googleapis.com/storage/v1/b/mdb-latest/o"

jobs:
# Job get-urls: runs scripts/create_urls_matrix.py and publishes the resulting
# matrix JSON as a job output for the matrix jobs that depend on it.
  get-urls:
    runs-on: ubuntu-latest
    outputs:
      # Compacted content of ./urls_matrix.json, written by the `set-matrix` step.
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string; an unquoted value is
          # parsed as a float (e.g. 3.10 would silently become 3.1).
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install wheel numpy
      - name: Create URLs matrix with the direct download and latest URLs
        # NOTE(review): presumably this script writes ./urls_matrix.json and
        # ./get_urls_report.txt, which the following steps read - confirm.
        run: |
          python scripts/create_urls_matrix.py
      - name: Set URLs matrix
        id: set-matrix
        run: |
          # -c emits the JSON on a single line, as required for a
          # `name=value` entry in the step output file.
          DATASETS=$(jq -c . ./urls_matrix.json)
          echo "matrix=$DATASETS" >> "$GITHUB_OUTPUT"
      - name: Persist URLs matrix artifact
        # always(): keep the artifact for debugging even if an earlier step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: urls_matrix
          path: ./urls_matrix.json
      - name: Persist Get URLS report artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: get_urls_report
          path: ./get_urls_report.txt
# Job store-datasets: for every matrix entry, downloads each dataset from its
# direct-download URL, keeps only files that open as zip archives, uploads the
# resulting folder to the `mdb-latest` bucket and persists one report per dataset.
  store-datasets:
    needs: [ get-urls ]
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJson(needs.get-urls.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string (3.10 would become 3.1).
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install wheel requests
      - name: Validate and download the datasets
        shell: python
        env:
          API_SOURCE_SECRETS: ${{ secrets.API_SOURCE_SECRETS }}
          # Matrix values are handed over through the environment instead of
          # being spliced into the Python source: this avoids script injection
          # and keeps Python from interpreting backslash escapes in the JSON.
          MATRIX_HASH: ${{ matrix.hash }}
          MATRIX_DATA: ${{ matrix.data }}
        run: |
          import json
          import os
          import sys
          from zipfile import ZipFile

          import requests

          # OS constants
          ROOT = os.getcwd()
          DATASETS = "datasets"
          FOLDER = os.environ["MATRIX_HASH"]
          DOWNLOAD_REPORTS = "download_reports"

          # Jobs constants
          BASE = "base"
          DIRECT_DOWNLOAD = "direct_download"
          AUTHENTICATION_TYPE = "authentication_type"
          API_KEY_PARAMETER_NAME = "api_key_parameter_name"

          # Secrets constants
          API_SOURCE_SECRETS = "API_SOURCE_SECRETS"

          # (connect, read) timeouts in seconds. Without a timeout, requests
          # waits forever on a stalled server and blocks the runner.
          REQUEST_TIMEOUT = (30, 300)

          # Load API source secrets: a JSON object looked up by dataset `base`.
          api_source_secrets = json.loads(os.environ[API_SOURCE_SECRETS])

          # The matrix data is a whitespace-separated list of JSON objects,
          # one per dataset.
          jobs = os.environ["MATRIX_DATA"].split()
          for job in jobs:
              print(job)
              try:
                  job_json = json.loads(job)
              except ValueError as e:
                  # A malformed entry means the matrix itself is broken: fail the step.
                  print(e)
                  sys.exit(1)

              base = job_json[BASE]
              url = job_json[DIRECT_DOWNLOAD]
              authentication_type = job_json.get(AUTHENTICATION_TYPE)
              api_key_parameter_name = job_json.get(API_KEY_PARAMETER_NAME)
              api_key_parameter_value = api_source_secrets.get(base)

              # Download the dataset
              zip_path = os.path.join(ROOT, DATASETS, FOLDER, f"{base}.zip")
              os.makedirs(os.path.dirname(zip_path), exist_ok=True)

              # Authentication type 1 sends the key as a query parameter,
              # type 2 sends it as a request header.
              params = {}
              headers = {}
              if authentication_type == 1:
                  params[api_key_parameter_name] = api_key_parameter_value
              elif authentication_type == 2:
                  headers[api_key_parameter_name] = api_key_parameter_value

              try:
                  response = requests.get(
                      url,
                      params=params,
                      headers=headers,
                      allow_redirects=True,
                      timeout=REQUEST_TIMEOUT,
                  )
                  response.raise_for_status()
              except Exception as e:
                  # Deliberately broad: a failing source is reported, it must
                  # not abort the remaining downloads.
                  file_log = (
                      f"{base}: FAILURE! Exception {e} occurred when downloading URL {url}.\n"
                  )
              else:
                  with open(zip_path, "wb") as f:
                      f.write(response.content)
                  # Make sure that the download file is a zip file
                  try:
                      # Opened in a context manager so the handle is always closed.
                      with ZipFile(zip_path, "r"):
                          pass
                      file_log = (
                          f"{base}: SUCCESS! A GTFS dataset zip file was downloaded.\n"
                      )
                  except Exception as e:
                      # Remove the invalid file so it is not uploaded to the bucket.
                      os.remove(zip_path)
                      file_log = (
                          f"{base}: FAILURE! Exception {e} occurred when loading the zip file.\n"
                      )

              report_path = os.path.join(ROOT, DOWNLOAD_REPORTS, f"{base}.txt")
              os.makedirs(os.path.dirname(report_path), exist_ok=True)
              with open(report_path, "w") as fp:
                  fp.write(file_log)
      - name: Set up and authorize Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.ARCHIVE_DATASET_SA_KEY }}
      - name: Upload datasets to Google Cloud Storage
        id: upload-datasets
        # NOTE(review): the scraped source showed this ref as "[email protected]",
        # i.e. e-mail obfuscation residue of `upload-cloud-storage@<version>`.
        # Restored to the v2 major tag - confirm the version originally pinned.
        uses: google-github-actions/upload-cloud-storage@v2
        with:
          path: datasets/${{ matrix.hash }}
          destination: mdb-latest
          # false: upload the folder's files without the folder name in the object path.
          parent: false
      - name: Persist Download Reports artifact
        # always(): keep the reports even when a previous step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: download_reports_${{ matrix.hash }}
          path: download_reports
# Job merge-download-reports-artifacts: once store-datasets has finished,
# combines every `download_reports_*` artifact into `download_reports_all`.
  merge-download-reports-artifacts:
    needs: store-datasets
    runs-on: ubuntu-latest
    steps:
      - name: Merge Artifacts Download Reports
        uses: actions/upload-artifact/merge@v4
        with:
          pattern: download_reports_*
          name: download_reports_all
          # Remove the per-matrix-entry artifacts after they have been merged.
          delete-merged: true
# Job validate-latest: after store-datasets, downloads every dataset again from
# its `latest` storage URL, checks that it opens as a zip archive and persists
# one report per dataset.
  validate-latest:
    needs: [ get-urls, store-datasets ]
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{ fromJson(needs.get-urls.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string (3.10 would become 3.1).
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install wheel requests
      - name: Validate the latest datasets
        shell: python
        env:
          # Matrix values are handed over through the environment instead of
          # being spliced into the Python source: this avoids script injection
          # and keeps Python from interpreting backslash escapes in the JSON.
          MATRIX_HASH: ${{ matrix.hash }}
          MATRIX_DATA: ${{ matrix.data }}
        run: |
          import json
          import os
          import sys
          from zipfile import ZipFile

          import requests

          # OS constants
          ROOT = os.getcwd()
          DATASETS = "datasets"
          FOLDER = os.environ["MATRIX_HASH"]
          LATEST_REPORTS = "latest_reports"
          # Defined in the workflow-level `env` block, so it is exported to this step.
          URL_STORAGE_PREFIX = os.environ["URL_STORAGE_PREFIX"]

          # Jobs constants
          BASE = "base"
          LATEST = "latest"

          # (connect, read) timeouts in seconds. Without a timeout, requests
          # waits forever on a stalled server and blocks the runner.
          REQUEST_TIMEOUT = (30, 300)

          # The matrix data is a whitespace-separated list of JSON objects,
          # one per dataset.
          jobs = os.environ["MATRIX_DATA"].split()
          for job in jobs:
              print(job)
              try:
                  job_json = json.loads(job)
              except ValueError as e:
                  # A malformed entry means the matrix itself is broken: fail the step.
                  print(e)
                  sys.exit(1)

              base = job_json[BASE]
              url = URL_STORAGE_PREFIX + job_json[LATEST]

              # Download the dataset
              zip_path = os.path.join(ROOT, DATASETS, FOLDER, f"{base}.zip")
              os.makedirs(os.path.dirname(zip_path), exist_ok=True)

              try:
                  response = requests.get(
                      url, allow_redirects=True, timeout=REQUEST_TIMEOUT
                  )
                  response.raise_for_status()
              except Exception as e:
                  # Deliberately broad: a failing dataset is reported, it must
                  # not abort the remaining validations.
                  file_log = (
                      f"{base}: FAILURE! Exception {e} occurred when downloading URL {url}.\n"
                  )
              else:
                  with open(zip_path, "wb") as f:
                      f.write(response.content)
                  # Make sure that the download file is a zip file
                  try:
                      # Opened in a context manager so the handle is always closed.
                      with ZipFile(zip_path, "r"):
                          pass
                      file_log = (
                          f"{base}: SUCCESS! A GTFS dataset zip file was downloaded.\n"
                      )
                  except Exception as e:
                      os.remove(zip_path)
                      file_log = (
                          f"{base}: FAILURE! Exception {e} occurred when loading the zip file.\n"
                      )

              report_path = os.path.join(ROOT, LATEST_REPORTS, f"{base}.txt")
              os.makedirs(os.path.dirname(report_path), exist_ok=True)
              with open(report_path, "w") as fp:
                  fp.write(file_log)
      - name: Persist Latest Reports artifact
        # always(): keep the reports even when a previous step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: latest_reports_${{ matrix.hash }}
          path: latest_reports
# Job merge-latest-reports-artifacts: once validate-latest has finished,
# combines every `latest_reports_*` artifact into `latest_reports_all`.
  merge-latest-reports-artifacts:
    needs: validate-latest
    runs-on: ubuntu-latest
    steps:
      - name: Merge Artifacts Latest Reports
        uses: actions/upload-artifact/merge@v4
        with:
          pattern: latest_reports_*
          name: latest_reports_all
          # Remove the per-matrix-entry artifacts after they have been merged.
          delete-merged: true