
Commit cd39fe1

fix: missing validation reports (#1102)
1 parent 46143a2 commit cd39fe1

File tree: 21 files changed, +1161 −103 lines changed


.gitignore

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ secrets-*.env
 vars.envx
 vars-*.env
 
-config/.env.dev
+config/.env.*
 venv
 cypress.env.json

Lines changed: 39 additions & 0 deletions
import os
from typing import Final

GTFS_VALIDATOR_URL_PROD: Final[
    str
] = "https://gtfs-validator-web-mbzoxaljzq-ue.a.run.app"
GTFS_VALIDATOR_URL_STAGING: Final[
    str
] = "https://stg-gtfs-validator-web-mbzoxaljzq-ue.a.run.app"


def get_gtfs_validator_results_bucket(is_prod: bool) -> str:
    """
    Get the GTFS validator results bucket name based on the environment.
    :param is_prod: true if the target environment is production, false otherwise
    :return: the bucket name for the target environment
    """
    if is_prod:
        return "gtfs-validator-results"
    else:
        return "stg-gtfs-validator-results"


def get_gtfs_validator_url(is_prod: bool) -> str:
    """
    Get the GTFS validator URL for the target environment, preferring the
    GTFS_VALIDATOR_URL environment variable when it is set.
    :param is_prod: true if the target environment is production, false otherwise;
        ignored when the GTFS_VALIDATOR_URL environment variable is set
    :return: the GTFS validator URL for the target environment
    """
    # Prefer an explicit override from the environment.
    result = os.getenv("GTFS_VALIDATOR_URL")
    if result:
        return result
    if is_prod:
        return GTFS_VALIDATOR_URL_PROD
    else:
        return GTFS_VALIDATOR_URL_STAGING
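
For orientation, a minimal sketch of how a caller might combine these two helpers. Only `get_gtfs_validator_url` and `get_gtfs_validator_results_bucket` come from the file above; deriving the production flag from `ENV` and the surrounding script are illustrative assumptions.

```python
import os

# Hypothetical caller: resolve the validator endpoint and results bucket for
# the current environment. The ENV-based is_prod flag is an assumption here.
is_prod = os.getenv("ENV", "dev").lower() == "prod"

validator_url = get_gtfs_validator_url(is_prod)              # GTFS_VALIDATOR_URL wins if set
results_bucket = get_gtfs_validator_results_bucket(is_prod)

print(f"validator: {validator_url}, reports bucket: {results_bucket}")
```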

functions-python/helpers/query_helper.py

Lines changed: 35 additions & 0 deletions
@@ -1,11 +1,14 @@
 import logging
+from datetime import datetime
 from typing import Type
 
 from shared.database_gen.sqlacodegen_models import (
     Feed,
     Gtfsrealtimefeed,
     Gtfsfeed,
     Gbfsfeed,
+    Gtfsdataset,
+    Validationreport,
 )
 from sqlalchemy import and_
 from sqlalchemy.orm import Session, joinedload
@@ -133,3 +136,35 @@ def get_feeds_query(
     except Exception as e:
         logging.error("Error building query: %s", str(e))
         raise
+
+
+def get_datasets_with_missing_reports_query(
+    db_session: Session,
+    filter_after: datetime | None = None,
+) -> Query:
+    """
+    Get datasets with missing validation reports.
+
+    Args:
+        db_session: SQLAlchemy session
+        filter_after: Optional date to filter datasets
+
+    Returns:
+        A SQLAlchemy query object for datasets with missing validation reports,
+        ordered by feed and dataset stable id.
+    """
+    query = (
+        db_session.query(
+            Gtfsfeed.stable_id,
+            Gtfsdataset.stable_id,
+        )
+        .select_from(Gtfsfeed)
+        .join(Gtfsdataset, Gtfsdataset.feed_id == Gtfsfeed.id)
+        .outerjoin(Validationreport, Gtfsdataset.validation_reports)
+        .filter(Validationreport.id.is_(None))
+    )
+    if filter_after:
+        query = query.filter(Gtfsdataset.downloaded_at >= filter_after)
+    query = query.distinct(Gtfsfeed.stable_id, Gtfsdataset.stable_id).order_by(
+        Gtfsdataset.stable_id, Gtfsfeed.stable_id
+    )
+    return query
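
A minimal sketch of consuming this query with a plain SQLAlchemy engine and session factory; the connection string and the 14-day cutoff are illustrative assumptions, not part of the diff.

```python
from datetime import datetime, timedelta

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Hypothetical wiring: connection string and session factory are assumptions.
engine = create_engine("postgresql+psycopg2://user:password@localhost/feeds")
SessionLocal = sessionmaker(bind=engine)

with SessionLocal() as session:
    cutoff = datetime.utcnow() - timedelta(days=14)
    rows = get_datasets_with_missing_reports_query(session, filter_after=cutoff).all()
    # Each row is a (feed_stable_id, dataset_stable_id) tuple.
    for feed_stable_id, dataset_stable_id in rows:
        print(feed_stable_id, dataset_stable_id)
```
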
Lines changed: 99 additions & 0 deletions
import os
import logging
import json
from time import sleep

from google.cloud import workflows_v1
from google.cloud.workflows import executions_v1
from google.cloud.workflows.executions_v1 import Execution

env = os.getenv("ENV", "dev").lower()
bucket_name = f"mobilitydata-datasets-{env}"


def execute_workflow(
    project: str,
    location: str = "northamerica-northeast1",
    workflow: str = "gtfs_validator_execution",
    input_data: dict = None,
) -> Execution:
    """
    Executes a workflow with the given input data and returns the resulting execution.
    @param project: The Google Cloud project id which contains the workflow to execute.
    @param location: The location for the workflow.
    @param workflow: The ID of the workflow to execute.
    @param input_data: A dictionary containing input data for the workflow.
    @return: The execution response.
    """
    execution_client = executions_v1.ExecutionsClient()
    workflows_client = workflows_v1.WorkflowsClient()
    parent = workflows_client.workflow_path(project, location, workflow)

    # Prepare the execution input as a JSON string.
    input_json = json.dumps(input_data) if input_data else "{}"

    # Create and configure the execution request with input data.
    execution_request = Execution(argument=input_json)
    response = execution_client.create_execution(
        parent=parent, execution=execution_request
    )
    logging.info(f"Created execution: {response.name}")
    execution = execution_client.get_execution(request={"name": response.name})
    return execution


def execute_workflows(
    latest_datasets,
    validator_endpoint=None,
    bypass_db_update=False,
    reports_bucket_name=None,
):
    """
    Execute the workflow for the latest datasets that need their validation report to be updated.
    :param latest_datasets: List of tuples containing the feed stable id and dataset stable id
    :param validator_endpoint: The URL of the validator
    :param bypass_db_update: Whether to bypass the database update
    :param reports_bucket_name: The name of the bucket where the reports are stored
    :return: List of dataset stable ids for which the workflow was executed
    """
    project_id = f"mobility-feeds-{env}"
    location = os.getenv("LOCATION", "northamerica-northeast1")
    execution_triggered_datasets = []
    batch_size = int(os.getenv("BATCH_SIZE", 5))
    sleep_time = int(os.getenv("SLEEP_TIME", 5))
    count = 0
    logging.info(f"Executing workflow for {len(latest_datasets)} datasets")
    for feed_id, dataset_id in latest_datasets:
        try:
            input_data = {
                "data": {
                    "bypass_db_update": bypass_db_update,
                    "protoPayload": {
                        "resourceName": "projects/_/"
                        f"buckets/{bucket_name}/"
                        f"objects/{feed_id}/{dataset_id}/{dataset_id}.zip"
                    },
                    "resource": {
                        "labels": {"location": location, "project_id": project_id},
                    },
                }
            }
            if validator_endpoint:
                input_data["data"]["validator_endpoint"] = validator_endpoint
            if reports_bucket_name:
                input_data["data"]["reports_bucket_name"] = reports_bucket_name
            logging.info(f"Executing workflow for {feed_id}/{dataset_id}")
            execute_workflow(project_id, input_data=input_data)
            execution_triggered_datasets.append(dataset_id)
        except Exception as e:
            logging.error(
                f"Error while executing workflow for {feed_id}/{dataset_id}: {e}"
            )
        count += 1
        logging.info(f"Triggered workflow execution for {count} datasets")
        if count % batch_size == 0:
            logging.info(
                f"Sleeping for {sleep_time} seconds before next batch to avoid rate limiting..."
            )
            sleep(sleep_time)
    return execution_triggered_datasets
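
Putting the pieces together, a hypothetical task body that feeds the missing-report query into `execute_workflows`. The function name, the `days` parameter, and the session argument are assumptions; `get_datasets_with_missing_reports_query`, `get_gtfs_validator_url`, `get_gtfs_validator_results_bucket`, and `execute_workflows` are the helpers introduced by this commit (import paths omitted).

```python
import os
from datetime import datetime, timedelta

from sqlalchemy.orm import Session


def trigger_missing_validation_reports(session: Session, days: int = 14) -> list:
    """Hypothetical task body: find datasets without reports and re-run validation."""
    is_prod = os.getenv("ENV", "dev").lower() == "prod"
    cutoff = datetime.utcnow() - timedelta(days=days)
    latest_datasets = get_datasets_with_missing_reports_query(
        session, filter_after=cutoff
    ).all()
    return execute_workflows(
        latest_datasets,
        validator_endpoint=get_gtfs_validator_url(is_prod),
        reports_bucket_name=get_gtfs_validator_results_bucket(is_prod),
    )
```
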
Lines changed: 10 additions & 0 deletions
[run]
omit =
    */test*/*
    */helpers/*
    */database_gen/*
    */shared/*

[report]
exclude_lines =
    if __name__ == .__main__.:

Lines changed: 25 additions & 0 deletions
# Tasks Executor

This directory contains Google Cloud Functions used as a single point of access to multiple _tasks_.

## Usage
The function receives the following payload:
```
{
    "task": "string",   # [required] Name of the task to execute
    "payload": {        # [optional] Payload to pass to the task
        "dry_run": true,
        "filter_after_in_days": 14,
        "filter_statuses": ["active", "inactive", "future"]
    }
}
```
To get the list of supported tasks use:
```
{
    "task": "list_tasks",
    "payload": {}
}
```
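
For a quick local try-out of this payload, one option is to serve the entry point with functions-framework and POST to it (the deployed function itself is not HTTP-triggered, per its configuration below). The command, port, and endpoint here are illustrative assumptions.

```python
# Assumes the function is being served locally, e.g. with something like:
#   functions-framework --target tasks_executor --port 8080
import requests

resp = requests.post(
    "http://localhost:8080/",                    # hypothetical local endpoint
    json={"task": "list_tasks", "payload": {}},  # payload shape from the README
    timeout=60,
)
print(resp.status_code, resp.text)
```
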
Lines changed: 21 additions & 0 deletions
{
    "name": "tasks_executor",
    "description": "The Tasks Executor function runs maintenance tasks avoiding the creation of multiple functions for one-time execution",
    "entry_point": "tasks_executor",
    "timeout": 540,
    "memory": "4Gi",
    "trigger_http": false,
    "include_folders": ["helpers"],
    "include_api_folders": ["database_gen", "database", "common"],
    "environment_variables": [],
    "secret_environment_variables": [
        {
            "key": "FEEDS_DATABASE_URL"
        }
    ],
    "ingress_settings": "ALLOW_ALL",
    "max_instance_request_concurrency": 1,
    "max_instance_count": 1,
    "min_instance_count": 0,
    "available_cpu": 1
}

Lines changed: 42 additions & 0 deletions
# Code to be able to debug locally without affecting the runtime cloud function
#
# Requirements:
# - Google Cloud SDK installed
# - Make sure to have the following environment variables set in your .env.local file
# - Local database in running state
#
# Usage:
# - python tasks_executor/main_local_debug.py
# - This can easily be run/debugged in a local IDE like PyCharm or VSCode

import flask
from flask.testing import EnvironBuilder

from main import tasks_executor

# Create a Flask app instance
app = flask.Flask(__name__)

if __name__ == "__main__":
    # Create a mock payload
    payload = {"task": "list_tasks"}

    # Push the application context
    with app.app_context():
        # Build a mock request environment
        builder = EnvironBuilder(app=app, method="POST", path="/", json=payload)
        env = builder.get_environ()

        # Create a Flask request object
        mock_request = flask.Request(env)

        # Call the tasks_executor function with the mock request
        response = tasks_executor(mock_request)

        # If the response is a tuple, extract the response object
        if isinstance(response, tuple):
            response, _ = response

        # Print the response data
        print(response.get_data(as_text=True))

Lines changed: 22 additions & 0 deletions
# Common packages
functions-framework==3.*
google-cloud-logging
psycopg2-binary==2.9.6
aiohttp~=3.10.5
asyncio~=3.4.3
urllib3~=2.2.2
requests~=2.32.3
attrs~=23.1.0
pluggy~=1.3.0
certifi~=2024.7.4

# SQL Alchemy and Geo Alchemy
SQLAlchemy==2.0.23
geoalchemy2==0.14.7

# Google specific packages for this function
google-cloud-workflows
flask

# Configuration
python-dotenv==1.0.0

Lines changed: 4 additions & 0 deletions
Faker
pytest~=7.4.3
urllib3-mock
requests-mock
